annotate arm/h264dsp_neon.S @ 9707:b69723e55653 libavcodec

Move a DECLARE_ALIGNED_16 variable in the Nellymoser encoder from the stack into the context to avoid issues when stack variables can not be aligned reliably.
author reimar
date Mon, 25 May 2009 12:17:02 +0000
parents d56b711c6c5d
children f5ffd813dc7f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
1 /*
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
3 *
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
4 * This file is part of FFmpeg.
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
5 *
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
10 *
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
14 * Lesser General Public License for more details.
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
15 *
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
19 */
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
20
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
21 #include "asm.S"
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
22
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
23 .fpu neon
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
24
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
25 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
26 vtrn.32 \r0, \r4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
27 vtrn.32 \r1, \r5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
28 vtrn.32 \r2, \r6
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
29 vtrn.32 \r3, \r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
30 vtrn.16 \r0, \r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
31 vtrn.16 \r1, \r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
32 vtrn.16 \r4, \r6
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
33 vtrn.16 \r5, \r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
34 vtrn.8 \r0, \r1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
35 vtrn.8 \r2, \r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
36 vtrn.8 \r4, \r5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
37 vtrn.8 \r6, \r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
38 .endm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
39
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
40 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
41 vswp \r0, \r4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
42 vswp \r1, \r5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
43 vswp \r2, \r6
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
44 vswp \r3, \r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
45 .endm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
46
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
47 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
48 vtrn.32 \r0, \r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
49 vtrn.32 \r1, \r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
50 vtrn.32 \r4, \r6
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
51 vtrn.32 \r5, \r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
52 vtrn.16 \r0, \r1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
53 vtrn.16 \r2, \r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
54 vtrn.16 \r4, \r5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
55 vtrn.16 \r6, \r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
56 .endm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
57
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
58 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
59 .macro h264_chroma_mc8 type
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
60 function ff_\type\()_h264_chroma_mc8_neon, export=1
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
61 push {r4-r7, lr}
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
62 ldrd r4, [sp, #20]
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
63 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
64 mov lr, r0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
65 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
66 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
67 pld [r1, r2]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
68
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
69 muls r7, r4, r5
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
70 rsb r6, r7, r5, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
71 rsb ip, r7, r4, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
72 sub r4, r7, r4, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
73 sub r4, r4, r5, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
74 add r4, r4, #64
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
75
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
76 beq 2f
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
77
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
78 add r5, r1, r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
79
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
80 vdup.8 d0, r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
81 lsl r4, r2, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
82 vdup.8 d1, ip
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
83 vld1.64 {d4, d5}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
84 vdup.8 d2, r6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
85 vld1.64 {d6, d7}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
86 vdup.8 d3, r7
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
87
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
88 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
89 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
90
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
91 1: pld [r5]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
92 vmull.u8 q8, d4, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
93 vmlal.u8 q8, d5, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
94 vld1.64 {d4, d5}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
95 vmlal.u8 q8, d6, d2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
96 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
97 vmlal.u8 q8, d7, d3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
98 vmull.u8 q9, d6, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
99 subs r3, r3, #2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
100 vmlal.u8 q9, d7, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
101 vmlal.u8 q9, d4, d2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
102 vmlal.u8 q9, d5, d3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
103 vrshrn.u16 d16, q8, #6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
104 vld1.64 {d6, d7}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
105 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
106 vrshrn.u16 d17, q9, #6
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
107 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
108 vld1.64 {d20}, [lr,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
109 vld1.64 {d21}, [lr,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
110 vrhadd.u8 q8, q8, q10
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
111 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
112 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
113 vst1.64 {d16}, [r0,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
114 vst1.64 {d17}, [r0,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
115 bgt 1b
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
116
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
117 pop {r4-r7, pc}
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
118
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
119 2: tst r6, r6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
120 add ip, ip, r6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
121 vdup.8 d0, r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
122 vdup.8 d1, ip
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
123
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
124 beq 4f
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
125
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
126 add r5, r1, r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
127 lsl r4, r2, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
128 vld1.64 {d4}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
129 vld1.64 {d6}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
130
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
131 3: pld [r5]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
132 vmull.u8 q8, d4, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
133 vmlal.u8 q8, d6, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
134 vld1.64 {d4}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
135 vmull.u8 q9, d6, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
136 vmlal.u8 q9, d4, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
137 vld1.64 {d6}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
138 vrshrn.u16 d16, q8, #6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
139 vrshrn.u16 d17, q9, #6
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
140 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
141 vld1.64 {d20}, [lr,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
142 vld1.64 {d21}, [lr,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
143 vrhadd.u8 q8, q8, q10
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
144 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
145 subs r3, r3, #2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
146 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
147 vst1.64 {d16}, [r0,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
148 vst1.64 {d17}, [r0,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
149 bgt 3b
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
150
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
151 pop {r4-r7, pc}
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
152
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
153 4: vld1.64 {d4, d5}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
154 vld1.64 {d6, d7}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
155 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
156 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
157
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
158 5: pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
159 subs r3, r3, #2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
160 vmull.u8 q8, d4, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
161 vmlal.u8 q8, d5, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
162 vld1.64 {d4, d5}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
163 vmull.u8 q9, d6, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
164 vmlal.u8 q9, d7, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
165 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
166 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
167 vrshrn.u16 d16, q8, #6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
168 vrshrn.u16 d17, q9, #6
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
169 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
170 vld1.64 {d20}, [lr,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
171 vld1.64 {d21}, [lr,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
172 vrhadd.u8 q8, q8, q10
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
173 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
174 vld1.64 {d6, d7}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
175 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
176 vst1.64 {d16}, [r0,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
177 vst1.64 {d17}, [r0,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
178 bgt 5b
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
179
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
180 pop {r4-r7, pc}
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
181 .endfunc
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
182 .endm
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
183
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
184 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
185 .macro h264_chroma_mc4 type
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
186 function ff_\type\()_h264_chroma_mc4_neon, export=1
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
187 push {r4-r7, lr}
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
188 ldrd r4, [sp, #20]
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
189 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
190 mov lr, r0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
191 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
192 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
193 pld [r1, r2]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
194
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
195 muls r7, r4, r5
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
196 rsb r6, r7, r5, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
197 rsb ip, r7, r4, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
198 sub r4, r7, r4, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
199 sub r4, r4, r5, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
200 add r4, r4, #64
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
201
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
202 beq 2f
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
203
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
204 add r5, r1, r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
205
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
206 vdup.8 d0, r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
207 lsl r4, r2, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
208 vdup.8 d1, ip
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
209 vld1.64 {d4}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
210 vdup.8 d2, r6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
211 vld1.64 {d6}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
212 vdup.8 d3, r7
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
213
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
214 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
215 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
216 vtrn.32 d4, d5
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
217 vtrn.32 d6, d7
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
218
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
219 vtrn.32 d0, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
220 vtrn.32 d2, d3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
221
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
222 1: pld [r5]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
223 vmull.u8 q8, d4, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
224 vmlal.u8 q8, d6, d2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
225 vld1.64 {d4}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
226 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
227 vtrn.32 d4, d5
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
228 vmull.u8 q9, d6, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
229 vmlal.u8 q9, d4, d2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
230 vld1.64 {d6}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
231 vadd.i16 d16, d16, d17
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
232 vadd.i16 d17, d18, d19
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
233 vrshrn.u16 d16, q8, #6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
234 subs r3, r3, #2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
235 pld [r1]
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
236 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
237 vld1.32 {d20[0]}, [lr,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
238 vld1.32 {d20[1]}, [lr,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
239 vrhadd.u8 d16, d16, d20
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
240 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
241 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
242 vtrn.32 d6, d7
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
243 vst1.32 {d16[0]}, [r0,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
244 vst1.32 {d16[1]}, [r0,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
245 bgt 1b
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
246
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
247 pop {r4-r7, pc}
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
248
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
249 2: tst r6, r6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
250 add ip, ip, r6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
251 vdup.8 d0, r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
252 vdup.8 d1, ip
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
253 vtrn.32 d0, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
254
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
255 beq 4f
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
256
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
257 vext.32 d1, d0, d1, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
258 add r5, r1, r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
259 lsl r4, r2, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
260 vld1.32 {d4[0]}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
261 vld1.32 {d4[1]}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
262
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
263 3: pld [r5]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
264 vmull.u8 q8, d4, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
265 vld1.32 {d4[0]}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
266 vmull.u8 q9, d4, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
267 vld1.32 {d4[1]}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
268 vadd.i16 d16, d16, d17
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
269 vadd.i16 d17, d18, d19
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
270 vrshrn.u16 d16, q8, #6
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
271 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
272 vld1.32 {d20[0]}, [lr,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
273 vld1.32 {d20[1]}, [lr,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
274 vrhadd.u8 d16, d16, d20
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
275 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
276 subs r3, r3, #2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
277 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
278 vst1.32 {d16[0]}, [r0,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
279 vst1.32 {d16[1]}, [r0,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
280 bgt 3b
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
281
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
282 pop {r4-r7, pc}
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
283
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
284 4: vld1.64 {d4}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
285 vld1.64 {d6}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
286 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
287 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
288 vtrn.32 d4, d5
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
289 vtrn.32 d6, d7
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
290
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
291 5: vmull.u8 q8, d4, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
292 vmull.u8 q9, d6, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
293 subs r3, r3, #2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
294 vld1.64 {d4}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
295 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
296 vtrn.32 d4, d5
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
297 vadd.i16 d16, d16, d17
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
298 vadd.i16 d17, d18, d19
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
299 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
300 vrshrn.u16 d16, q8, #6
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
301 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
302 vld1.32 {d20[0]}, [lr,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
303 vld1.32 {d20[1]}, [lr,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
304 vrhadd.u8 d16, d16, d20
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
305 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
306 vld1.64 {d6}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
307 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
308 vtrn.32 d6, d7
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
309 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
310 vst1.32 {d16[0]}, [r0,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
311 vst1.32 {d16[1]}, [r0,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
312 bgt 5b
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
313
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
314 pop {r4-r7, pc}
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
315 .endfunc
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
316 .endm
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
317
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
318 .text
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
319 .align
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
320
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
321 h264_chroma_mc8 put
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
322 h264_chroma_mc8 avg
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
323 h264_chroma_mc4 put
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
324 h264_chroma_mc4 avg
8337
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
325
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
326 /* H.264 loop filter */
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
327
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
328 .macro h264_loop_filter_start
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
329 ldr ip, [sp]
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
330 tst r2, r2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
331 ldr ip, [ip]
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
332 tstne r3, r3
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
333 vmov.32 d24[0], ip
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
334 and ip, ip, ip, lsl #16
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
335 bxeq lr
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
336 ands ip, ip, ip, lsl #8
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
337 bxlt lr
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
338 .endm
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
339
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
340 .macro align_push_regs
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
341 and ip, sp, #15
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
342 add ip, ip, #32
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
343 sub sp, sp, ip
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
344 vst1.64 {d12-d15}, [sp,:128]
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
345 sub sp, sp, #32
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
346 vst1.64 {d8-d11}, [sp,:128]
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
347 .endm
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
348
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
349 .macro align_pop_regs
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
350 vld1.64 {d8-d11}, [sp,:128]!
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
351 vld1.64 {d12-d15}, [sp,:128], ip
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
352 .endm
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
353
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
354 .macro h264_loop_filter_luma
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
355 vdup.8 q11, r2 @ alpha
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
356 vmovl.u8 q12, d24
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
357 vabd.u8 q6, q8, q0 @ abs(p0 - q0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
358 vmovl.u16 q12, d24
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
359 vabd.u8 q14, q9, q8 @ abs(p1 - p0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
360 vsli.16 q12, q12, #8
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
361 vabd.u8 q15, q1, q0 @ abs(q1 - q0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
362 vsli.32 q12, q12, #16
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
363 vclt.u8 q6, q6, q11 @ < alpha
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
364 vdup.8 q11, r3 @ beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
365 vclt.s8 q7, q12, #0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
366 vclt.u8 q14, q14, q11 @ < beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
367 vclt.u8 q15, q15, q11 @ < beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
368 vbic q6, q6, q7
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
369 vabd.u8 q4, q10, q8 @ abs(p2 - p0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
370 vand q6, q6, q14
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
371 vabd.u8 q5, q2, q0 @ abs(q2 - q0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
372 vclt.u8 q4, q4, q11 @ < beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
373 vand q6, q6, q15
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
374 vclt.u8 q5, q5, q11 @ < beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
375 vand q4, q4, q6
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
376 vand q5, q5, q6
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
377 vand q12, q12, q6
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
378 vrhadd.u8 q14, q8, q0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
379 vsub.i8 q6, q12, q4
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
380 vqadd.u8 q7, q9, q12
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
381 vhadd.u8 q10, q10, q14
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
382 vsub.i8 q6, q6, q5
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
383 vhadd.u8 q14, q2, q14
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
384 vmin.u8 q7, q7, q10
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
385 vqsub.u8 q11, q9, q12
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
386 vqadd.u8 q2, q1, q12
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
387 vmax.u8 q7, q7, q11
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
388 vqsub.u8 q11, q1, q12
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
389 vmin.u8 q14, q2, q14
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
390 vmovl.u8 q2, d0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
391 vmax.u8 q14, q14, q11
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
392 vmovl.u8 q10, d1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
393 vsubw.u8 q2, q2, d16
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
394 vsubw.u8 q10, q10, d17
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
395 vshl.i16 q2, q2, #2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
396 vshl.i16 q10, q10, #2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
397 vaddw.u8 q2, q2, d18
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
398 vaddw.u8 q10, q10, d19
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
399 vsubw.u8 q2, q2, d2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
400 vsubw.u8 q10, q10, d3
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
401 vrshrn.i16 d4, q2, #3
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
402 vrshrn.i16 d5, q10, #3
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
403 vbsl q4, q7, q9
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
404 vbsl q5, q14, q1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
405 vneg.s8 q7, q6
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
406 vmovl.u8 q14, d16
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
407 vmin.s8 q2, q2, q6
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
408 vmovl.u8 q6, d17
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
409 vmax.s8 q2, q2, q7
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
410 vmovl.u8 q11, d0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
411 vmovl.u8 q12, d1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
412 vaddw.s8 q14, q14, d4
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
413 vaddw.s8 q6, q6, d5
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
414 vsubw.s8 q11, q11, d4
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
415 vsubw.s8 q12, q12, d5
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
416 vqmovun.s16 d16, q14
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
417 vqmovun.s16 d17, q6
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
418 vqmovun.s16 d0, q11
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
419 vqmovun.s16 d1, q12
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
420 .endm
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
421
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
422 function ff_h264_v_loop_filter_luma_neon, export=1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
423 h264_loop_filter_start
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
424
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
425 vld1.64 {d0, d1}, [r0,:128], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
426 vld1.64 {d2, d3}, [r0,:128], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
427 vld1.64 {d4, d5}, [r0,:128], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
428 sub r0, r0, r1, lsl #2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
429 sub r0, r0, r1, lsl #1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
430 vld1.64 {d20,d21}, [r0,:128], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
431 vld1.64 {d18,d19}, [r0,:128], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
432 vld1.64 {d16,d17}, [r0,:128], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
433
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
434 align_push_regs
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
435
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
436 h264_loop_filter_luma
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
437
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
438 sub r0, r0, r1, lsl #1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
439 vst1.64 {d8, d9}, [r0,:128], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
440 vst1.64 {d16,d17}, [r0,:128], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
441 vst1.64 {d0, d1}, [r0,:128], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
442 vst1.64 {d10,d11}, [r0,:128]
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
443
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
444 align_pop_regs
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
445 bx lr
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
446 .endfunc
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
447
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
448 function ff_h264_h_loop_filter_luma_neon, export=1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
449 h264_loop_filter_start
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
450
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
451 sub r0, r0, #4
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
452 vld1.64 {d6}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
453 vld1.64 {d20}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
454 vld1.64 {d18}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
455 vld1.64 {d16}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
456 vld1.64 {d0}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
457 vld1.64 {d2}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
458 vld1.64 {d4}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
459 vld1.64 {d26}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
460 vld1.64 {d7}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
461 vld1.64 {d21}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
462 vld1.64 {d19}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
463 vld1.64 {d17}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
464 vld1.64 {d1}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
465 vld1.64 {d3}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
466 vld1.64 {d5}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
467 vld1.64 {d27}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
468
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
469 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
8337
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
470
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
471 align_push_regs
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
472 sub sp, sp, #16
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
473 vst1.64 {d4, d5}, [sp,:128]
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
474 sub sp, sp, #16
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
475 vst1.64 {d20,d21}, [sp,:128]
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
476
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
477 h264_loop_filter_luma
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
478
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
479 vld1.64 {d20,d21}, [sp,:128]!
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
480 vld1.64 {d4, d5}, [sp,:128]!
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
481
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
482 transpose_8x8 q3, q10, q4, q8, q0, q5, q2, q13
8337
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
483
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
484 sub r0, r0, r1, lsl #4
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
485 vst1.64 {d6}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
486 vst1.64 {d20}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
487 vst1.64 {d8}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
488 vst1.64 {d16}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
489 vst1.64 {d0}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
490 vst1.64 {d10}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
491 vst1.64 {d4}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
492 vst1.64 {d26}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
493 vst1.64 {d7}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
494 vst1.64 {d21}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
495 vst1.64 {d9}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
496 vst1.64 {d17}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
497 vst1.64 {d1}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
498 vst1.64 {d11}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
499 vst1.64 {d5}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
500 vst1.64 {d27}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
501
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
502 align_pop_regs
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
503 bx lr
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
504 .endfunc
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
505
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
506 .macro h264_loop_filter_chroma
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
507 vdup.8 d22, r2 @ alpha
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
508 vmovl.u8 q12, d24
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
509 vabd.u8 d26, d16, d0 @ abs(p0 - q0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
510 vmovl.u8 q2, d0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
511 vabd.u8 d28, d18, d16 @ abs(p1 - p0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
512 vsubw.u8 q2, q2, d16
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
513 vsli.16 d24, d24, #8
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
514 vshl.i16 q2, q2, #2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
515 vabd.u8 d30, d2, d0 @ abs(q1 - q0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
516 vaddw.u8 q2, q2, d18
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
517 vclt.u8 d26, d26, d22 @ < alpha
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
518 vsubw.u8 q2, q2, d2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
519 vdup.8 d22, r3 @ beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
520 vclt.s8 d25, d24, #0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
521 vrshrn.i16 d4, q2, #3
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
522 vclt.u8 d28, d28, d22 @ < beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
523 vbic d26, d26, d25
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
524 vclt.u8 d30, d30, d22 @ < beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
525 vand d26, d26, d28
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
526 vneg.s8 d25, d24
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
527 vand d26, d26, d30
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
528 vmin.s8 d4, d4, d24
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
529 vmovl.u8 q14, d16
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
530 vand d4, d4, d26
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
531 vmax.s8 d4, d4, d25
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
532 vmovl.u8 q11, d0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
533 vaddw.s8 q14, q14, d4
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
534 vsubw.s8 q11, q11, d4
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
535 vqmovun.s16 d16, q14
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
536 vqmovun.s16 d0, q11
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
537 .endm
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
538
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
539 function ff_h264_v_loop_filter_chroma_neon, export=1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
540 h264_loop_filter_start
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
541
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
542 sub r0, r0, r1, lsl #1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
543 vld1.64 {d18}, [r0,:64], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
544 vld1.64 {d16}, [r0,:64], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
545 vld1.64 {d0}, [r0,:64], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
546 vld1.64 {d2}, [r0,:64]
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
547
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
548 h264_loop_filter_chroma
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
549
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
550 sub r0, r0, r1, lsl #1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
551 vst1.64 {d16}, [r0,:64], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
552 vst1.64 {d0}, [r0,:64], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
553
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
554 bx lr
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
555 .endfunc
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
556
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
557 function ff_h264_h_loop_filter_chroma_neon, export=1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
558 h264_loop_filter_start
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
559
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
560 sub r0, r0, #2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
561 vld1.32 {d18[0]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
562 vld1.32 {d16[0]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
563 vld1.32 {d0[0]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
564 vld1.32 {d2[0]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
565 vld1.32 {d18[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
566 vld1.32 {d16[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
567 vld1.32 {d0[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
568 vld1.32 {d2[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
569
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
570 vtrn.16 d18, d0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
571 vtrn.16 d16, d2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
572 vtrn.8 d18, d16
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
573 vtrn.8 d0, d2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
574
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
575 h264_loop_filter_chroma
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
576
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
577 vtrn.16 d18, d0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
578 vtrn.16 d16, d2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
579 vtrn.8 d18, d16
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
580 vtrn.8 d0, d2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
581
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
582 sub r0, r0, r1, lsl #3
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
583 vst1.32 {d18[0]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
584 vst1.32 {d16[0]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
585 vst1.32 {d0[0]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
586 vst1.32 {d2[0]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
587 vst1.32 {d18[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
588 vst1.32 {d16[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
589 vst1.32 {d0[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
590 vst1.32 {d2[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
591
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
592 bx lr
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
593 .endfunc
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
594
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
595 /* H.264 qpel MC */
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
596
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
597 .macro lowpass_const r
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
598 movw \r, #5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
599 movt \r, #20
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
600 vmov.32 d6[0], \r
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
601 .endm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
602
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
603 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
604 .if \narrow
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
605 t0 .req q0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
606 t1 .req q8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
607 .else
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
608 t0 .req \d0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
609 t1 .req \d1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
610 .endif
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
611 vext.8 d2, \r0, \r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
612 vext.8 d3, \r0, \r1, #3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
613 vaddl.u8 q1, d2, d3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
614 vext.8 d4, \r0, \r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
615 vext.8 d5, \r0, \r1, #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
616 vaddl.u8 q2, d4, d5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
617 vext.8 d30, \r0, \r1, #5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
618 vaddl.u8 t0, \r0, d30
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
619 vext.8 d18, \r2, \r3, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
620 vmla.i16 t0, q1, d6[1]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
621 vext.8 d19, \r2, \r3, #3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
622 vaddl.u8 q9, d18, d19
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
623 vext.8 d20, \r2, \r3, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
624 vmls.i16 t0, q2, d6[0]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
625 vext.8 d21, \r2, \r3, #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
626 vaddl.u8 q10, d20, d21
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
627 vext.8 d31, \r2, \r3, #5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
628 vaddl.u8 t1, \r2, d31
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
629 vmla.i16 t1, q9, d6[1]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
630 vmls.i16 t1, q10, d6[0]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
631 .if \narrow
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
632 vqrshrun.s16 \d0, t0, #5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
633 vqrshrun.s16 \d1, t1, #5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
634 .endif
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
635 .unreq t0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
636 .unreq t1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
637 .endm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
638
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
639 .macro lowpass_8_1 r0, r1, d0, narrow=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
640 .if \narrow
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
641 t0 .req q0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
642 .else
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
643 t0 .req \d0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
644 .endif
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
645 vext.8 d2, \r0, \r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
646 vext.8 d3, \r0, \r1, #3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
647 vaddl.u8 q1, d2, d3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
648 vext.8 d4, \r0, \r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
649 vext.8 d5, \r0, \r1, #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
650 vaddl.u8 q2, d4, d5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
651 vext.8 d30, \r0, \r1, #5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
652 vaddl.u8 t0, \r0, d30
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
653 vmla.i16 t0, q1, d6[1]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
654 vmls.i16 t0, q2, d6[0]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
655 .if \narrow
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
656 vqrshrun.s16 \d0, t0, #5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
657 .endif
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
658 .unreq t0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
659 .endm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
660
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
661 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
662 vext.16 q1, \r0, \r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
663 vext.16 q0, \r0, \r1, #3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
664 vaddl.s16 q9, d2, d0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
665 vext.16 q2, \r0, \r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
666 vaddl.s16 q1, d3, d1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
667 vext.16 q3, \r0, \r1, #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
668 vaddl.s16 q10, d4, d6
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
669 vext.16 \r1, \r0, \r1, #5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
670 vaddl.s16 q2, d5, d7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
671 vaddl.s16 q0, \h0, \h1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
672 vaddl.s16 q8, \l0, \l1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
673
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
674 vshl.i32 q3, q9, #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
675 vshl.i32 q9, q9, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
676 vshl.i32 q15, q10, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
677 vadd.i32 q9, q9, q3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
678 vadd.i32 q10, q10, q15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
679
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
680 vshl.i32 q3, q1, #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
681 vshl.i32 q1, q1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
682 vshl.i32 q15, q2, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
683 vadd.i32 q1, q1, q3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
684 vadd.i32 q2, q2, q15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
685
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
686 vadd.i32 q9, q9, q8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
687 vsub.i32 q9, q9, q10
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
688
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
689 vadd.i32 q1, q1, q0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
690 vsub.i32 q1, q1, q2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
691
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
692 vrshrn.s32 d18, q9, #10
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
693 vrshrn.s32 d19, q1, #10
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
694
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
695 vqmovun.s16 \d, q9
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
696 .endm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
697
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
698 function put_h264_qpel16_h_lowpass_neon_packed
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
699 mov r4, lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
700 mov ip, #16
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
701 mov r3, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
702 bl put_h264_qpel8_h_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
703 sub r1, r1, r2, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
704 add r1, r1, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
705 mov ip, #16
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
706 mov lr, r4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
707 b put_h264_qpel8_h_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
708 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
709
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
710 function put_h264_qpel16_h_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
711 push {lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
712 mov ip, #16
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
713 bl put_h264_qpel8_h_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
714 sub r0, r0, r3, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
715 sub r1, r1, r2, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
716 add r0, r0, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
717 add r1, r1, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
718 mov ip, #16
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
719 pop {lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
720 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
721
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
722 function put_h264_qpel8_h_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
723 1: vld1.64 {d0, d1}, [r1], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
724 vld1.64 {d16,d17}, [r1], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
725 subs ip, ip, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
726 lowpass_8 d0, d1, d16, d17, d0, d16
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
727 vst1.64 {d0}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
728 vst1.64 {d16}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
729 bne 1b
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
730 bx lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
731 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
732
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
733 function put_h264_qpel16_h_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
734 push {lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
735 mov ip, #16
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
736 bl put_h264_qpel8_h_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
737 sub r0, r0, r2, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
738 sub r1, r1, r2, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
739 sub r3, r3, r2, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
740 add r0, r0, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
741 add r1, r1, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
742 add r3, r3, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
743 mov ip, #16
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
744 pop {lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
745 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
746
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
747 function put_h264_qpel8_h_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
748 1: vld1.64 {d0, d1}, [r1], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
749 vld1.64 {d16,d17}, [r1], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
750 vld1.64 {d28}, [r3], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
751 vld1.64 {d29}, [r3], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
752 subs ip, ip, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
753 lowpass_8 d0, d1, d16, d17, d0, d1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
754 vrhadd.u8 q0, q0, q14
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
755 vst1.64 {d0}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
756 vst1.64 {d1}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
757 bne 1b
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
758 bx lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
759 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
760
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
761 function put_h264_qpel16_v_lowpass_neon_packed
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
762 mov r4, lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
763 mov r2, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
764 bl put_h264_qpel8_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
765 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
766 bl put_h264_qpel8_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
767 sub r1, r1, r3, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
768 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
769 add r1, r1, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
770 bl put_h264_qpel8_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
771 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
772 mov lr, r4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
773 b put_h264_qpel8_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
774 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
775
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
776 function put_h264_qpel16_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
777 mov r4, lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
778 bl put_h264_qpel8_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
779 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
780 bl put_h264_qpel8_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
781 sub r0, r0, r2, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
782 add r0, r0, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
783 sub r1, r1, r3, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
784 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
785 add r1, r1, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
786 bl put_h264_qpel8_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
787 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
788 mov lr, r4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
789 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
790
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
791 function put_h264_qpel8_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
792 vld1.64 {d8}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
793 vld1.64 {d10}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
794 vld1.64 {d12}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
795 vld1.64 {d14}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
796 vld1.64 {d22}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
797 vld1.64 {d24}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
798 vld1.64 {d26}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
799 vld1.64 {d28}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
800 vld1.64 {d9}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
801 vld1.64 {d11}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
802 vld1.64 {d13}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
803 vld1.64 {d15}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
804 vld1.64 {d23}, [r1]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
805
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
806 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
807 lowpass_8 d8, d9, d10, d11, d8, d10
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
808 lowpass_8 d12, d13, d14, d15, d12, d14
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
809 lowpass_8 d22, d23, d24, d25, d22, d24
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
810 lowpass_8 d26, d27, d28, d29, d26, d28
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
811 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
812
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
813 vst1.64 {d8}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
814 vst1.64 {d10}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
815 vst1.64 {d12}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
816 vst1.64 {d14}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
817 vst1.64 {d22}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
818 vst1.64 {d24}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
819 vst1.64 {d26}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
820 vst1.64 {d28}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
821
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
822 bx lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
823 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
824
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
825 function put_h264_qpel16_v_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
826 mov r4, lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
827 bl put_h264_qpel8_v_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
828 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
829 bl put_h264_qpel8_v_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
830 sub r0, r0, r3, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
831 sub ip, ip, r2, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
832 add r0, r0, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
833 add ip, ip, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
834 sub r1, r1, r3, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
835 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
836 add r1, r1, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
837 bl put_h264_qpel8_v_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
838 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
839 mov lr, r4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
840 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
841
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
842 function put_h264_qpel8_v_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
843 vld1.64 {d8}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
844 vld1.64 {d10}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
845 vld1.64 {d12}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
846 vld1.64 {d14}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
847 vld1.64 {d22}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
848 vld1.64 {d24}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
849 vld1.64 {d26}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
850 vld1.64 {d28}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
851 vld1.64 {d9}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
852 vld1.64 {d11}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
853 vld1.64 {d13}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
854 vld1.64 {d15}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
855 vld1.64 {d23}, [r1]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
856
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
857 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
858 lowpass_8 d8, d9, d10, d11, d8, d9
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
859 lowpass_8 d12, d13, d14, d15, d12, d13
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
860 lowpass_8 d22, d23, d24, d25, d22, d23
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
861 lowpass_8 d26, d27, d28, d29, d26, d27
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
862 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
863
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
864 vld1.64 {d0}, [ip], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
865 vld1.64 {d1}, [ip], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
866 vld1.64 {d2}, [ip], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
867 vld1.64 {d3}, [ip], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
868 vld1.64 {d4}, [ip], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
869 vrhadd.u8 q0, q0, q4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
870 vld1.64 {d5}, [ip], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
871 vrhadd.u8 q1, q1, q6
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
872 vld1.64 {d10}, [ip], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
873 vrhadd.u8 q2, q2, q11
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
874 vld1.64 {d11}, [ip], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
875
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
876 vst1.64 {d0}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
877 vst1.64 {d1}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
878 vrhadd.u8 q5, q5, q13
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
879 vst1.64 {d2}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
880 vst1.64 {d3}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
881 vst1.64 {d4}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
882 vst1.64 {d5}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
883 vst1.64 {d10}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
884 vst1.64 {d11}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
885
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
886 bx lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
887 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
888
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
889 function put_h264_qpel8_hv_lowpass_neon_top
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
890 lowpass_const ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
891 mov ip, #12
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
892 1: vld1.64 {d0, d1}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
893 vld1.64 {d16,d17}, [r1], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
894 subs ip, ip, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
895 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
896 vst1.64 {d22-d25}, [r4,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
897 bne 1b
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
898
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
899 vld1.64 {d0, d1}, [r1]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
900 lowpass_8_1 d0, d1, q12, narrow=0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
901
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
902 mov ip, #-16
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
903 add r4, r4, ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
904 vld1.64 {d30,d31}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
905 vld1.64 {d20,d21}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
906 vld1.64 {d18,d19}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
907 vld1.64 {d16,d17}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
908 vld1.64 {d14,d15}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
909 vld1.64 {d12,d13}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
910 vld1.64 {d10,d11}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
911 vld1.64 {d8, d9}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
912 vld1.64 {d6, d7}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
913 vld1.64 {d4, d5}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
914 vld1.64 {d2, d3}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
915 vld1.64 {d0, d1}, [r4,:128]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
916
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
917 swap4 d1, d3, d5, d7, d8, d10, d12, d14
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
918 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
919
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
920 swap4 d17, d19, d21, d31, d24, d26, d28, d22
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
921 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
922
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
923 vst1.64 {d30,d31}, [r4,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
924 vst1.64 {d6, d7}, [r4,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
925 vst1.64 {d20,d21}, [r4,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
926 vst1.64 {d4, d5}, [r4,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
927 vst1.64 {d18,d19}, [r4,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
928 vst1.64 {d2, d3}, [r4,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
929 vst1.64 {d16,d17}, [r4,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
930 vst1.64 {d0, d1}, [r4,:128]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
931
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
932 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
933 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
934 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
935 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
936
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
937 vld1.64 {d16,d17}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
938 vld1.64 {d30,d31}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
939 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
940 vld1.64 {d16,d17}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
941 vld1.64 {d30,d31}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
942 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
943 vld1.64 {d16,d17}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
944 vld1.64 {d30,d31}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
945 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
946 vld1.64 {d16,d17}, [r4,:128], ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
947 vld1.64 {d30,d31}, [r4,:128]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
948 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
949
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
950 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
951
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
952 bx lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
953 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
954
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
955 function put_h264_qpel8_hv_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
956 mov r10, lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
957 bl put_h264_qpel8_hv_lowpass_neon_top
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
958 vst1.64 {d12}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
959 vst1.64 {d13}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
960 vst1.64 {d14}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
961 vst1.64 {d15}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
962 vst1.64 {d8}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
963 vst1.64 {d9}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
964 vst1.64 {d10}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
965 vst1.64 {d11}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
966
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
967 mov lr, r10
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
968 bx lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
969 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
970
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
971 function put_h264_qpel8_hv_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
972 mov r10, lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
973 bl put_h264_qpel8_hv_lowpass_neon_top
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
974
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
975 vld1.64 {d0, d1}, [r2,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
976 vld1.64 {d2, d3}, [r2,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
977 vrhadd.u8 q0, q0, q6
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
978 vld1.64 {d4, d5}, [r2,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
979 vrhadd.u8 q1, q1, q7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
980 vld1.64 {d6, d7}, [r2,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
981 vrhadd.u8 q2, q2, q4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
982
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
983 vst1.64 {d0}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
984 vrhadd.u8 q3, q3, q5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
985 vst1.64 {d1}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
986 vst1.64 {d2}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
987 vst1.64 {d3}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
988 vst1.64 {d4}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
989 vst1.64 {d5}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
990 vst1.64 {d6}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
991 vst1.64 {d7}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
992
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
993 mov lr, r10
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
994 bx lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
995 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
996
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
997 function put_h264_qpel16_hv_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
998 mov r9, lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
999 bl put_h264_qpel8_hv_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1000 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1001 bl put_h264_qpel8_hv_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1002 sub r1, r1, r3, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1003 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1004 add r1, r1, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1005 sub r0, r0, r2, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1006 add r0, r0, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1007 bl put_h264_qpel8_hv_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1008 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1009 mov lr, r9
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1010 b put_h264_qpel8_hv_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1011 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1012
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1013 function put_h264_qpel16_hv_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1014 mov r9, lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1015 sub r2, r4, #256
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1016 bl put_h264_qpel8_hv_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1017 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1018 bl put_h264_qpel8_hv_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1019 sub r1, r1, r3, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1020 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1021 add r1, r1, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1022 sub r0, r0, r3, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1023 add r0, r0, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1024 bl put_h264_qpel8_hv_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1025 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1026 mov lr, r9
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1027 b put_h264_qpel8_hv_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1028 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1029
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1030 function ff_put_h264_qpel8_mc10_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1031 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1032 mov r3, r1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1033 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1034 mov ip, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1035 b put_h264_qpel8_h_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1036 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1037
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1038 function ff_put_h264_qpel8_mc20_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1039 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1040 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1041 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1042 mov ip, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1043 b put_h264_qpel8_h_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1044 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1045
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1046 function ff_put_h264_qpel8_mc30_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1047 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1048 add r3, r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1049 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1050 mov ip, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1051 b put_h264_qpel8_h_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1052 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1053
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1054 function ff_put_h264_qpel8_mc01_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1055 push {lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1056 mov ip, r1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1057 put_h264_qpel8_mc01:
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1058 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1059 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1060 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1061 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1062 bl put_h264_qpel8_v_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1063 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1064 pop {pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1065 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1066
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1067 function ff_put_h264_qpel8_mc11_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1068 push {r0, r1, r2, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1069 put_h264_qpel8_mc11:
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1070 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1071 sub sp, sp, #64
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1072 mov r0, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1073 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1074 mov r3, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1075 mov ip, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1076 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1077 bl put_h264_qpel8_h_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1078 ldrd r0, [sp, #128]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1079 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1080 add ip, sp, #64
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1081 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1082 mov r2, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1083 bl put_h264_qpel8_v_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1084 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1085 add sp, sp, #76
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1086 pop {pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1087 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1088
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1089 function ff_put_h264_qpel8_mc21_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1090 push {r0, r1, r4, r10, r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1091 put_h264_qpel8_mc21:
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1092 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1093 mov r11, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1094 bic sp, sp, #15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1095 sub sp, sp, #(8*8+16*12)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1096 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1097 mov r3, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1098 mov r0, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1099 mov ip, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1100 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1101 bl put_h264_qpel8_h_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1102 mov r4, r0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1103 ldrd r0, [r11]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1104 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1105 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1106 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1107 sub r2, r4, #64
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1108 bl put_h264_qpel8_hv_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1109 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1110 add sp, r11, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1111 pop {r4, r10, r11, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1112 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1113
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1114 function ff_put_h264_qpel8_mc31_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1115 add r1, r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1116 push {r0, r1, r2, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1117 sub r1, r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1118 b put_h264_qpel8_mc11
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1119 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1120
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1121 function ff_put_h264_qpel8_mc02_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1122 push {lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1123 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1124 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1125 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1126 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1127 bl put_h264_qpel8_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1128 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1129 pop {pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1130 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1131
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1132 function ff_put_h264_qpel8_mc12_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1133 push {r0, r1, r4, r10, r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1134 put_h264_qpel8_mc12:
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1135 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1136 mov r11, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1137 bic sp, sp, #15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1138 sub sp, sp, #(8*8+16*12)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1139 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1140 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1141 mov r2, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1142 mov r0, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1143 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1144 bl put_h264_qpel8_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1145 mov r4, r0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1146 ldrd r0, [r11]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1147 sub r1, r1, r3, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1148 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1149 sub r2, r4, #64
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1150 bl put_h264_qpel8_hv_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1151 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1152 add sp, r11, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1153 pop {r4, r10, r11, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1154 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1155
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1156 function ff_put_h264_qpel8_mc22_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1157 push {r4, r10, r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1158 mov r11, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1159 bic sp, sp, #15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1160 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1161 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1162 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1163 sub sp, sp, #(16*12)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1164 mov r4, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1165 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1166 bl put_h264_qpel8_hv_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1167 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1168 mov sp, r11
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1169 pop {r4, r10, r11, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1170 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1171
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1172 function ff_put_h264_qpel8_mc32_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1173 push {r0, r1, r4, r10, r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1174 add r1, r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1175 b put_h264_qpel8_mc12
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1176 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1177
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1178 function ff_put_h264_qpel8_mc03_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1179 push {lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1180 add ip, r1, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1181 b put_h264_qpel8_mc01
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1182 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1183
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1184 function ff_put_h264_qpel8_mc13_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1185 push {r0, r1, r2, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1186 add r1, r1, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1187 b put_h264_qpel8_mc11
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1188 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1189
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1190 function ff_put_h264_qpel8_mc23_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1191 push {r0, r1, r4, r10, r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1192 add r1, r1, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1193 b put_h264_qpel8_mc21
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1194 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1195
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1196 function ff_put_h264_qpel8_mc33_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1197 add r1, r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1198 push {r0, r1, r2, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1199 add r1, r1, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1200 sub r1, r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1201 b put_h264_qpel8_mc11
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1202 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1203
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1204 function ff_put_h264_qpel16_mc10_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1205 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1206 mov r3, r1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1207 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1208 b put_h264_qpel16_h_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1209 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1210
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1211 function ff_put_h264_qpel16_mc20_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1212 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1213 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1214 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1215 b put_h264_qpel16_h_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1216 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1217
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1218 function ff_put_h264_qpel16_mc30_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1219 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1220 add r3, r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1221 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1222 b put_h264_qpel16_h_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1223 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1224
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1225 function ff_put_h264_qpel16_mc01_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1226 push {r4, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1227 mov ip, r1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1228 put_h264_qpel16_mc01:
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1229 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1230 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1231 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1232 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1233 bl put_h264_qpel16_v_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1234 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1235 pop {r4, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1236 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1237
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1238 function ff_put_h264_qpel16_mc11_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1239 push {r0, r1, r4, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1240 put_h264_qpel16_mc11:
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1241 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1242 sub sp, sp, #256
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1243 mov r0, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1244 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1245 mov r3, #16
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1246 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1247 bl put_h264_qpel16_h_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1248 add r0, sp, #256
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1249 ldrd r0, [r0, #64]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1250 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1251 add ip, sp, #64
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1252 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1253 mov r2, #16
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1254 bl put_h264_qpel16_v_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1255 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1256 add sp, sp, #(256+8)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1257 pop {r4, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1258 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1259
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1260 function ff_put_h264_qpel16_mc21_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1261 push {r0, r1, r4-r5, r9-r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1262 put_h264_qpel16_mc21:
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1263 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1264 mov r11, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1265 bic sp, sp, #15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1266 sub sp, sp, #(16*16+16*12)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1267 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1268 mov r0, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1269 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1270 bl put_h264_qpel16_h_lowpass_neon_packed
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1271 mov r4, r0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1272 ldrd r0, [r11]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1273 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1274 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1275 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1276 bl put_h264_qpel16_hv_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1277 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1278 add sp, r11, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1279 pop {r4-r5, r9-r11, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1280 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1281
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1282 function ff_put_h264_qpel16_mc31_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1283 add r1, r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1284 push {r0, r1, r4, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1285 sub r1, r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1286 b put_h264_qpel16_mc11
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1287 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1288
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1289 function ff_put_h264_qpel16_mc02_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1290 push {r4, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1291 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1292 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1293 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1294 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1295 bl put_h264_qpel16_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1296 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1297 pop {r4, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1298 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1299
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1300 function ff_put_h264_qpel16_mc12_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1301 push {r0, r1, r4-r5, r9-r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1302 put_h264_qpel16_mc12:
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1303 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1304 mov r11, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1305 bic sp, sp, #15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1306 sub sp, sp, #(16*16+16*12)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1307 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1308 mov r0, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1309 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1310 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1311 bl put_h264_qpel16_v_lowpass_neon_packed
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1312 mov r4, r0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1313 ldrd r0, [r11]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1314 sub r1, r1, r3, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1315 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1316 mov r2, r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1317 bl put_h264_qpel16_hv_lowpass_l2_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1318 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1319 add sp, r11, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1320 pop {r4-r5, r9-r11, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1321 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1322
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1323 function ff_put_h264_qpel16_mc22_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1324 push {r4, r9-r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1325 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1326 mov r11, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1327 bic sp, sp, #15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1328 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1329 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1330 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1331 sub sp, sp, #(16*12)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1332 mov r4, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1333 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1334 bl put_h264_qpel16_hv_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1335 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1336 mov sp, r11
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1337 pop {r4, r9-r11, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1338 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1339
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1340 function ff_put_h264_qpel16_mc32_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1341 push {r0, r1, r4-r5, r9-r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1342 add r1, r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1343 b put_h264_qpel16_mc12
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1344 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1345
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1346 function ff_put_h264_qpel16_mc03_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1347 push {r4, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1348 add ip, r1, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1349 b put_h264_qpel16_mc01
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1350 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1351
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1352 function ff_put_h264_qpel16_mc13_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1353 push {r0, r1, r4, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1354 add r1, r1, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1355 b put_h264_qpel16_mc11
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1356 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1357
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1358 function ff_put_h264_qpel16_mc23_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1359 push {r0, r1, r4-r5, r9-r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1360 add r1, r1, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1361 b put_h264_qpel16_mc21
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1362 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1363
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1364 function ff_put_h264_qpel16_mc33_neon, export=1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1365 add r1, r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1366 push {r0, r1, r4, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1367 add r1, r1, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1368 sub r1, r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1369 b put_h264_qpel16_mc11
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1370 .endfunc
8663
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1371
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1372 @ Biweighted prediction
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1373
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1374 .macro biweight_16 macs, macd
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1375 vdup.8 d0, r4
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1376 vdup.8 d1, r5
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1377 vmov q2, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1378 vmov q3, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1379 1: subs ip, ip, #2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1380 vld1.8 {d20-d21},[r0,:128], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1381 \macd q2, d0, d20
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1382 pld [r0]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1383 \macd q3, d0, d21
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1384 vld1.8 {d22-d23},[r1,:128], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1385 \macs q2, d1, d22
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1386 pld [r1]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1387 \macs q3, d1, d23
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1388 vmov q12, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1389 vld1.8 {d28-d29},[r0,:128], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1390 vmov q13, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1391 \macd q12, d0, d28
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1392 pld [r0]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1393 \macd q13, d0, d29
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1394 vld1.8 {d30-d31},[r1,:128], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1395 \macs q12, d1, d30
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1396 pld [r1]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1397 \macs q13, d1, d31
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1398 vshl.s16 q2, q2, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1399 vshl.s16 q3, q3, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1400 vqmovun.s16 d4, q2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1401 vqmovun.s16 d5, q3
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1402 vshl.s16 q12, q12, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1403 vshl.s16 q13, q13, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1404 vqmovun.s16 d24, q12
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1405 vqmovun.s16 d25, q13
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1406 vmov q3, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1407 vst1.8 {d4- d5}, [r6,:128], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1408 vmov q2, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1409 vst1.8 {d24-d25},[r6,:128], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1410 bne 1b
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1411 pop {r4-r6, pc}
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1412 .endm
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1413
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1414 .macro biweight_8 macs, macd
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1415 vdup.8 d0, r4
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1416 vdup.8 d1, r5
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1417 vmov q1, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1418 vmov q10, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1419 1: subs ip, ip, #2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1420 vld1.8 {d4},[r0,:64], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1421 \macd q1, d0, d4
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1422 pld [r0]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1423 vld1.8 {d5},[r1,:64], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1424 \macs q1, d1, d5
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1425 pld [r1]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1426 vld1.8 {d6},[r0,:64], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1427 \macd q10, d0, d6
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1428 pld [r0]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1429 vld1.8 {d7},[r1,:64], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1430 \macs q10, d1, d7
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1431 pld [r1]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1432 vshl.s16 q1, q1, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1433 vqmovun.s16 d2, q1
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1434 vshl.s16 q10, q10, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1435 vqmovun.s16 d4, q10
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1436 vmov q10, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1437 vst1.8 {d2},[r6,:64], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1438 vmov q1, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1439 vst1.8 {d4},[r6,:64], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1440 bne 1b
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1441 pop {r4-r6, pc}
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1442 .endm
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1443
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1444 .macro biweight_4 macs, macd
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1445 vdup.8 d0, r4
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1446 vdup.8 d1, r5
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1447 vmov q1, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1448 vmov q10, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1449 1: subs ip, ip, #4
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1450 vld1.32 {d4[0]},[r0,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1451 vld1.32 {d4[1]},[r0,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1452 \macd q1, d0, d4
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1453 pld [r0]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1454 vld1.32 {d5[0]},[r1,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1455 vld1.32 {d5[1]},[r1,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1456 \macs q1, d1, d5
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1457 pld [r1]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1458 blt 2f
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1459 vld1.32 {d6[0]},[r0,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1460 vld1.32 {d6[1]},[r0,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1461 \macd q10, d0, d6
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1462 pld [r0]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1463 vld1.32 {d7[0]},[r1,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1464 vld1.32 {d7[1]},[r1,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1465 \macs q10, d1, d7
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1466 pld [r1]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1467 vshl.s16 q1, q1, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1468 vqmovun.s16 d2, q1
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1469 vshl.s16 q10, q10, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1470 vqmovun.s16 d4, q10
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1471 vmov q10, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1472 vst1.32 {d2[0]},[r6,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1473 vst1.32 {d2[1]},[r6,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1474 vmov q1, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1475 vst1.32 {d4[0]},[r6,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1476 vst1.32 {d4[1]},[r6,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1477 bne 1b
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1478 pop {r4-r6, pc}
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1479 2: vshl.s16 q1, q1, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1480 vqmovun.s16 d2, q1
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1481 vst1.32 {d2[0]},[r6,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1482 vst1.32 {d2[1]},[r6,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1483 pop {r4-r6, pc}
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1484 .endm
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1485
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1486 .macro biweight_func w
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1487 function biweight_h264_pixels_\w\()_neon
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1488 push {r4-r6, lr}
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1489 add r4, sp, #16
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1490 ldm r4, {r4-r6}
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1491 lsr lr, r4, #31
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1492 add r6, r6, #1
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1493 eors lr, lr, r5, lsr #30
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1494 orr r6, r6, #1
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1495 vdup.16 q9, r3
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1496 lsl r6, r6, r3
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1497 vmvn q9, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1498 vdup.16 q8, r6
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1499 mov r6, r0
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1500 beq 10f
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1501 subs lr, lr, #1
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1502 beq 20f
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1503 subs lr, lr, #1
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1504 beq 30f
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1505 b 40f
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1506 10: biweight_\w vmlal.u8, vmlal.u8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1507 20: rsb r4, r4, #0
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1508 biweight_\w vmlal.u8, vmlsl.u8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1509 30: rsb r4, r4, #0
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1510 rsb r5, r5, #0
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1511 biweight_\w vmlsl.u8, vmlsl.u8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1512 40: rsb r5, r5, #0
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1513 biweight_\w vmlsl.u8, vmlal.u8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1514 .endfunc
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1515 .endm
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1516
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1517 .macro biweight_entry w, h, b=1
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1518 function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1519 mov ip, #\h
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1520 .if \b
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1521 b biweight_h264_pixels_\w\()_neon
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1522 .endif
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1523 .endfunc
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1524 .endm
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1525
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1526 biweight_entry 16, 8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1527 biweight_entry 16, 16, b=0
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1528 biweight_func 16
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1529
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1530 biweight_entry 8, 16
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1531 biweight_entry 8, 4
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1532 biweight_entry 8, 8, b=0
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1533 biweight_func 8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1534
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1535 biweight_entry 4, 8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1536 biweight_entry 4, 2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1537 biweight_entry 4, 4, b=0
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1538 biweight_func 4
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1539
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1540 @ Weighted prediction
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1541
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1542 .macro weight_16 add
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1543 vdup.8 d0, r3
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1544 1: subs ip, ip, #2
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1545 vld1.8 {d20-d21},[r0,:128], r1
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1546 vmull.u8 q2, d0, d20
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1547 pld [r0]
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1548 vmull.u8 q3, d0, d21
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1549 vld1.8 {d28-d29},[r0,:128], r1
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1550 vmull.u8 q12, d0, d28
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1551 pld [r0]
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1552 vmull.u8 q13, d0, d29
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1553 \add q2, q8, q2
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1554 vrshl.s16 q2, q2, q9
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1555 \add q3, q8, q3
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1556 vrshl.s16 q3, q3, q9
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1557 vqmovun.s16 d4, q2
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1558 vqmovun.s16 d5, q3
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1559 \add q12, q8, q12
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1560 vrshl.s16 q12, q12, q9
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1561 \add q13, q8, q13
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1562 vrshl.s16 q13, q13, q9
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1563 vqmovun.s16 d24, q12
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1564 vqmovun.s16 d25, q13
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1565 vst1.8 {d4- d5}, [r4,:128], r1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1566 vst1.8 {d24-d25},[r4,:128], r1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1567 bne 1b
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1568 pop {r4, pc}
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1569 .endm
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1570
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1571 .macro weight_8 add
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1572 vdup.8 d0, r3
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1573 1: subs ip, ip, #2
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1574 vld1.8 {d4},[r0,:64], r1
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1575 vmull.u8 q1, d0, d4
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1576 pld [r0]
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1577 vld1.8 {d6},[r0,:64], r1
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1578 vmull.u8 q10, d0, d6
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1579 \add q1, q8, q1
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1580 pld [r0]
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1581 vrshl.s16 q1, q1, q9
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1582 vqmovun.s16 d2, q1
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1583 \add q10, q8, q10
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1584 vrshl.s16 q10, q10, q9
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1585 vqmovun.s16 d4, q10
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1586 vst1.8 {d2},[r4,:64], r1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1587 vst1.8 {d4},[r4,:64], r1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1588 bne 1b
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1589 pop {r4, pc}
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1590 .endm
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1591
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1592 .macro weight_4 add
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1593 vdup.8 d0, r3
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1594 vmov q1, q8
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1595 vmov q10, q8
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1596 1: subs ip, ip, #4
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1597 vld1.32 {d4[0]},[r0,:32], r1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1598 vld1.32 {d4[1]},[r0,:32], r1
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1599 vmull.u8 q1, d0, d4
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1600 pld [r0]
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1601 blt 2f
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1602 vld1.32 {d6[0]},[r0,:32], r1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1603 vld1.32 {d6[1]},[r0,:32], r1
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1604 vmull.u8 q10, d0, d6
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1605 pld [r0]
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1606 \add q1, q8, q1
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1607 vrshl.s16 q1, q1, q9
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1608 vqmovun.s16 d2, q1
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1609 \add q10, q8, q10
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1610 vrshl.s16 q10, q10, q9
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1611 vqmovun.s16 d4, q10
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1612 vmov q10, q8
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1613 vst1.32 {d2[0]},[r4,:32], r1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1614 vst1.32 {d2[1]},[r4,:32], r1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1615 vmov q1, q8
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1616 vst1.32 {d4[0]},[r4,:32], r1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1617 vst1.32 {d4[1]},[r4,:32], r1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1618 bne 1b
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1619 pop {r4, pc}
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1620 2: \add q1, q8, q1
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1621 vrshl.s16 q1, q1, q9
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1622 vqmovun.s16 d2, q1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1623 vst1.32 {d2[0]},[r4,:32], r1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1624 vst1.32 {d2[1]},[r4,:32], r1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1625 pop {r4, pc}
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1626 .endm
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1627
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1628 .macro weight_func w
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1629 function weight_h264_pixels_\w\()_neon
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1630 push {r4, lr}
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1631 ldr r4, [sp, #8]
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1632 cmp r2, #1
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1633 lsl r4, r4, r2
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1634 vdup.16 q8, r4
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1635 mov r4, r0
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1636 ble 20f
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1637 rsb lr, r2, #1
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1638 vdup.16 q9, lr
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1639 cmp r3, #0
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1640 blt 10f
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1641 weight_\w vhadd.s16
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1642 10: rsb r3, r3, #0
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1643 weight_\w vhsub.s16
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1644 20: rsb lr, r2, #0
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1645 vdup.16 q9, lr
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1646 cmp r3, #0
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1647 blt 10f
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1648 weight_\w vadd.s16
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1649 10: rsb r3, r3, #0
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
1650 weight_\w vsub.s16
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1651 .endfunc
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1652 .endm
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1653
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1654 .macro weight_entry w, h, b=1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1655 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1656 mov ip, #\h
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1657 .if \b
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1658 b weight_h264_pixels_\w\()_neon
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1659 .endif
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1660 .endfunc
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1661 .endm
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1662
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1663 weight_entry 16, 8
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1664 weight_entry 16, 16, b=0
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1665 weight_func 16
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1666
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1667 weight_entry 8, 16
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1668 weight_entry 8, 4
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1669 weight_entry 8, 8, b=0
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1670 weight_func 8
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1671
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1672 weight_entry 4, 8
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1673 weight_entry 4, 2
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1674 weight_entry 4, 4, b=0
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1675 weight_func 4