comparison arm/h264idct_neon.S @ 8462:0ca0e3c98ed5 libavcodec

ARM: add new h264 idct functions
author mru
date Thu, 25 Dec 2008 23:13:43 +0000
parents 9281a8a9387a
children 779a9c93bf61
comparison
equal deleted inserted replaced
8461:11307ea31e57 8462:0ca0e3c98ed5
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */ 19 */
20 20
21 #include "asm.S" 21 #include "asm.S"
22 22
23 preserve8
23 .fpu neon 24 .fpu neon
24 25
25 .text 26 .text
26 27
27 function ff_h264_idct_add_neon, export=1 28 function ff_h264_idct_add_neon, export=1
92 vst1.32 {d0[1]}, [r0,:32], r2 93 vst1.32 {d0[1]}, [r0,:32], r2
93 vst1.32 {d1[0]}, [r0,:32], r2 94 vst1.32 {d1[0]}, [r0,:32], r2
94 vst1.32 {d1[1]}, [r0,:32], r2 95 vst1.32 {d1[1]}, [r0,:32], r2
95 bx lr 96 bx lr
96 .endfunc 97 .endfunc
98
/*
 * ff_h264_idct_add16_neon
 *
 * Walks 16 consecutive 32-byte coefficient blocks and, for each one, calls
 * ff_h264_idct_add_neon or ff_h264_idct_dc_add_neon, or skips the block.
 *
 * In:  r0   = base destination pointer
 *      r1   = pointer to an array of 32-bit offsets, one per block
 *      r2   = pointer to the first block (advanced by 32 bytes per iteration)
 *      r3   = handed to the callee unchanged in r2
 *             (presumably the line stride; confirm against the C prototype)
 *      [sp] = pointer to a byte table that is indexed through scan8
 *             (presumably per-block non-zero counts; confirm against caller)
 *
 * Per block, with n = table[scan8[i]] and c = first 16-bit coefficient:
 *      n == 0             nothing is called
 *      n == 1 and c != 0  ff_h264_idct_dc_add_neon
 *      otherwise          ff_h264_idct_add_neon
 *
 * The callee gets r0 = base + offset, r1 = block, r2 = original r3.
 * ip, r1 and r2 are used after the call without being reloaded, so both
 * callees have to leave them unchanged.
 */
99 function ff_h264_idct_add16_neon, export=1
100 push {r4-r8,lr} /* 6 registers, 24 bytes */
101 mov r4, r0 /* r4 = base destination pointer */
102 mov r5, r1 /* r5 = cursor into the offset array */
103 mov r1, r2 /* r1 = current block, second argument of the callee */
104 mov r2, r3 /* r2 = third argument of the callee */
105 ldr r6, [sp, #24] /* fifth argument: first stack word above the 24 bytes pushed */
106 movw r7, #:lower16:scan8
107 movt r7, #:upper16:scan8 /* r7 = absolute address of scan8 */
108 mov ip, #16 /* ip = blocks left to process */
109 1: ldrb r8, [r7], #1 /* r8 = next scan8 entry */
110 ldr r0, [r5], #4 /* r0 = offset of this block */
111 ldrb r8, [r6, r8] /* r8 = n = table[scan8 entry] */
112 subs r8, r8, #1 /* flags describe n - 1 */
113 blt 2f /* n == 0: skip the block */
114 ldrsh lr, [r1] /* lr = first coefficient; flags from subs are kept */
115 add r0, r0, r4 /* r0 = destination of this block; flags still kept */
116 movne lr, #0 /* n > 1: drop the coefficient so the full IDCT is chosen */
117 cmp lr, #0
118 adrne lr, ff_h264_idct_dc_add_neon /* n == 1 and coefficient != 0 */
119 adreq lr, ff_h264_idct_add_neon /* every other case with n != 0 */
120 blx lr
121 2: subs ip, ip, #1
122 add r1, r1, #32 /* next block; plain add keeps the flags set by subs */
123 bne 1b
124 pop {r4-r8,pc} /* restore registers and return */
125 .endfunc
126
/*
 * ff_h264_idct_add16intra_neon
 *
 * Walks 16 consecutive 32-byte coefficient blocks and, for each one, calls
 * ff_h264_idct_add_neon or ff_h264_idct_dc_add_neon, or skips the block.
 *
 * In:  r0   = base destination pointer
 *      r1   = pointer to an array of 32-bit offsets, one per block
 *      r2   = pointer to the first block (advanced by 32 bytes per iteration)
 *      r3   = handed to the callee unchanged in r2
 *             (presumably the line stride; confirm against the C prototype)
 *      [sp] = pointer to a byte table that is indexed through scan8
 *             (presumably per-block non-zero counts; confirm against caller)
 *
 * Per block, with n = table[scan8[i]] and c = first 16-bit coefficient:
 *      n != 0             ff_h264_idct_add_neon
 *      n == 0 and c != 0  ff_h264_idct_dc_add_neon
 *      n == 0 and c == 0  nothing is called
 *
 * The callee gets r0 = base + offset, r1 = block, r2 = original r3.
 * ip, r1 and r2 are used after the call without being reloaded, so both
 * callees have to leave them unchanged.
 */
127 function ff_h264_idct_add16intra_neon, export=1
128 push {r4-r8,lr} /* 6 registers, 24 bytes */
129 mov r4, r0 /* r4 = base destination pointer */
130 mov r5, r1 /* r5 = cursor into the offset array */
131 mov r1, r2 /* r1 = current block, second argument of the callee */
132 mov r2, r3 /* r2 = third argument of the callee */
133 ldr r6, [sp, #24] /* fifth argument: first stack word above the 24 bytes pushed */
134 movw r7, #:lower16:scan8
135 movt r7, #:upper16:scan8 /* r7 = absolute address of scan8 */
136 mov ip, #16 /* ip = blocks left to process */
137 1: ldrb r8, [r7], #1 /* r8 = next scan8 entry */
138 ldr r0, [r5], #4 /* r0 = offset of this block */
139 ldrb r8, [r6, r8] /* r8 = n = table[scan8 entry] */
140 add r0, r0, r4 /* r0 = destination of this block */
141 cmp r8, #0 /* Z set when n == 0 */
142 ldrsh r8, [r1] /* r8 = first coefficient; flags from cmp are kept */
143 adrne lr, ff_h264_idct_add_neon /* n != 0 */
144 adreq lr, ff_h264_idct_dc_add_neon /* n == 0 */
145 cmpeq r8, #0 /* only when n == 0: Z set when the coefficient is 0 too */
146 blxne lr /* call unless n == 0 and coefficient == 0 */
147 subs ip, ip, #1
148 add r1, r1, #32 /* next block; plain add keeps the flags set by subs */
149 bne 1b
150 pop {r4-r8,pc} /* restore registers and return */
151 .endfunc
152
/*
 * ff_h264_idct_add8_neon
 *
 * Walks the 8 coefficient blocks with indices 16..23 and, for each one, calls
 * ff_h264_idct_add_neon or ff_h264_idct_dc_add_neon, or skips the block.
 * Blocks 16..19 are added to the first destination, blocks 20..23 to the
 * second one.
 *
 * In:  r0   = pointer to two consecutive destination pointers
 *      r1   = pointer to an array of 32-bit offsets; entries 16..23 are used
 *      r2   = pointer to block 0; blocks are 32 bytes each, 16..23 are used
 *      r3   = handed to the callee unchanged in r2
 *             (presumably the line stride; confirm against the C prototype)
 *      [sp] = pointer to a byte table that is indexed through scan8
 *             (presumably per-block non-zero counts; confirm against caller)
 *
 * Per block, with n = table[scan8[i]] and c = first 16-bit coefficient:
 *      n != 0             ff_h264_idct_add_neon
 *      n == 0 and c != 0  ff_h264_idct_dc_add_neon
 *      n == 0 and c == 0  nothing is called
 *
 * The callee gets r0 = destination + offset, r1 = block, r2 = original r3.
 * ip, r1 and r2 are used after the call without being reloaded, so both
 * callees have to leave them unchanged.
 */
153 function ff_h264_idct_add8_neon, export=1
154 push {r4-r10,lr} /* 8 registers, 32 bytes; r10 is saved but not used here */
155 ldm r0, {r4,r9} /* r4 = first destination, r9 = second destination */
156 add r5, r1, #16*4 /* r5 = cursor into the offset array, starting at entry 16 */
157 add r1, r2, #16*32 /* r1 = block 16, second argument of the callee */
158 mov r2, r3 /* r2 = third argument of the callee */
159 ldr r6, [sp, #32] /* fifth argument: first stack word above the 32 bytes pushed */
160 movw r7, #:lower16:scan8+16
161 movt r7, #:upper16:scan8+16 /* r7 = absolute address of scan8 entry 16 */
162 mov ip, #8 /* ip = blocks left to process, counts 8 down to 1 */
163 1: ldrb r8, [r7], #1 /* r8 = next scan8 entry */
164 ldr r0, [r5], #4 /* r0 = offset of this block */
165 ldrb r8, [r6, r8] /* r8 = n = table[scan8 entry] */
166 cmp ip, #4 /* ip = 8..5 are the first four blocks, ip = 4..1 the last four */
167 addhi r0, r0, r4 /* first four blocks go to the first destination */
168 addls r0, r0, r9 /* last four blocks go to the second destination */
169 cmp r8, #0 /* Z set when n == 0 */
170 ldrsh r8, [r1] /* r8 = first coefficient; flags from cmp are kept */
171 adrne lr, ff_h264_idct_add_neon /* n != 0 */
172 adreq lr, ff_h264_idct_dc_add_neon /* n == 0 */
173 cmpeq r8, #0 /* only when n == 0: Z set when the coefficient is 0 too */
174 blxne lr /* call unless n == 0 and coefficient == 0 */
175 subs ip, ip, #1
176 add r1, r1, #32 /* next block; plain add keeps the flags set by subs */
177 bne 1b
178 pop {r4-r10,pc} /* restore registers and return */
179 .endfunc
180
/*
 * scan8: read-only byte table. Each entry is used as an index into the byte
 * table that the ff_h264_idct_add16/add16intra/add8 functions receive as
 * their fifth argument. Entries are written as x + y*8 (presumably column
 * and row of an 8-wide grid; confirm against the C definition of scan8).
 *
 * Entries 0..15 are read by ff_h264_idct_add16_neon and
 * ff_h264_idct_add16intra_neon; entries 16..23 are read by
 * ff_h264_idct_add8_neon through scan8+16.
 */
181 .section .rodata
182 scan8: .byte 4+1*8, 5+1*8, 4+2*8, 5+2*8 /* entries 0..3 */
183 .byte 6+1*8, 7+1*8, 6+2*8, 7+2*8 /* entries 4..7 */
184 .byte 4+3*8, 5+3*8, 4+4*8, 5+4*8 /* entries 8..11 */
185 .byte 6+3*8, 7+3*8, 6+4*8, 7+4*8 /* entries 12..15 */
186 .byte 1+1*8, 2+1*8 /* entries 16..17 */
187 .byte 1+2*8, 2+2*8 /* entries 18..19 */
188 .byte 1+4*8, 2+4*8 /* entries 20..21 */
189 .byte 1+5*8, 2+5*8 /* entries 22..23 */