comparison arm/jrevdct_arm.S @ 8359:9281a8a9387a libavcodec

ARM: replace "armv4l" with "arm"
author mru
date Wed, 17 Dec 2008 00:54:54 +0000
parents armv4l/jrevdct_arm.S@c45366b01126
children 989ea69f6a4e
comparison
equal deleted inserted replaced
8358:c30b92cf446b 8359:9281a8a9387a
1 /*
2 C-like prototype :
3 void j_rev_dct_ARM(DCTBLOCK data)
4
5 With DCTBLOCK being a pointer to an array of 64 'signed shorts'
6
7 Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
23 IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
24 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 */
27
28 #include "asm.S"
29
/* Fixed-point multipliers. Each name spells out the real constant and each value
   is that constant multiplied by 2^13 and rounded (0.541196100 * 8192 ~= 4433).
   The FIX_M_* entries hold the negated constants. FIX_0xFFFF is a 16-bit mask,
   not a DCT coefficient. */
30 #define FIX_0_298631336 2446
31 #define FIX_0_541196100 4433
32 #define FIX_0_765366865 6270
33 #define FIX_1_175875602 9633
34 #define FIX_1_501321110 12299
35 #define FIX_2_053119869 16819
36 #define FIX_3_072711026 25172
37 #define FIX_M_0_390180644 -3196
38 #define FIX_M_0_899976223 -7373
39 #define FIX_M_1_847759065 -15137
40 #define FIX_M_1_961570560 -16069
41 #define FIX_M_2_562915447 -20995
42 #define FIX_0xFFFF 0xFFFF
43
/* Byte offsets of the matching .word entries inside const_array (4 bytes each).
   The code uses them as immediate offsets from the table base held in r11, so
   they must follow the order of the .word list exactly. */
44 #define FIX_0_298631336_ID 0
45 #define FIX_0_541196100_ID 4
46 #define FIX_0_765366865_ID 8
47 #define FIX_1_175875602_ID 12
48 #define FIX_1_501321110_ID 16
49 #define FIX_2_053119869_ID 20
50 #define FIX_3_072711026_ID 24
51 #define FIX_M_0_390180644_ID 28
52 #define FIX_M_0_899976223_ID 32
53 #define FIX_M_1_847759065_ID 36
54 #define FIX_M_1_961570560_ID 40
55 #define FIX_M_2_562915447_ID 44
56 #define FIX_0xFFFF_ID 48
57 .text
58 .align
59
@ j_rev_dct_ARM(r0 = block): in-place inverse DCT on 64 signed 16-bit values.
@ A row pass over 8 rows of 16 bytes is followed by a column pass over 8 columns
@ (columns are 2 bytes apart, with a 16-byte stride between elements of a column).
@ Register roles for the whole routine:
@   lr     = pointer to the row or column being processed
@   r12    = remaining iteration count of the current pass
@   r11    = base address of const_array (indexed with the FIX_*_ID offsets)
@   r0-r10 = scratch
@ Stack: [sp] keeps the original block pointer so the column pass can reload it.
60 function j_rev_dct_ARM, export=1
61 stmdb sp!, { r4 - r12, lr } @ all callee saved regs (popped again at the_end, with the saved lr loaded into pc)
62
63 sub sp, sp, #4 @ reserve some space on the stack
64 str r0, [ sp ] @ save the DCT pointer to the stack
65
66 mov lr, r0 @ lr = pointer to the current row
67 mov r12, #8 @ r12 = row-counter
@ PC-relative address of the table: in ARM state pc reads as the address of this
@ instruction plus 8, which is what the -8 in the expression compensates for.
68 add r11, pc, #(const_array-.-8) @ r11 = base pointer to the constants array
@ Row pass: one iteration per 16-byte row at lr, with r12 counting the rows left.
@ As loaded below, the even-numbered inputs d0,d2,d4,d6 sit at byte offsets
@ 0,2,4,6 and the odd-numbered inputs d1,d3,d5,d7 at byte offsets 8,10,12,14.
@ The eight results are rounded, shifted right by 11 and stored back over the
@ same row at offsets 0..14.
69 row_loop:
70 ldrsh r0, [lr, # 0] @ r0 = 'd0'
71 ldrsh r2, [lr, # 2] @ r2 = 'd2'
72
73 @ Optimization for rows that have all items except the first set to 0
74 @ (this works as the DCTELEMS are always 4-byte aligned)
75 ldr r5, [lr, # 0] @ r5 = word holding the elements at offsets 0 and 2
76 ldr r6, [lr, # 4] @ r6, r3, r4 = the remaining three words of the row
77 ldr r3, [lr, # 8]
78 ldr r4, [lr, #12]
79 orr r3, r3, r4
80 orr r3, r3, r6 @ r3 = OR of the elements at offsets 4..14
81 orrs r5, r3, r5 @ Z set only when the whole row is zero
82 beq end_of_row_loop @ nothing to be done as ALL of them are '0'
83 orrs r3, r3, r2 @ Z set when every element after the first is zero
84 beq empty_row
85
86 ldrsh r1, [lr, # 8] @ r1 = 'd1'
87 ldrsh r4, [lr, # 4] @ r4 = 'd4'
88 ldrsh r6, [lr, # 6] @ r6 = 'd6'
89
@ Even part: constant loads are interleaved with the arithmetic that uses them.
90 ldr r3, [r11, #FIX_0_541196100_ID]
91 add r7, r2, r6
92 ldr r5, [r11, #FIX_M_1_847759065_ID]
93 mul r7, r3, r7 @ r7 = z1
94 ldr r3, [r11, #FIX_0_765366865_ID]
95 mla r6, r5, r6, r7 @ r6 = tmp2
96 add r5, r0, r4 @ r5 = tmp0
97 mla r2, r3, r2, r7 @ r2 = tmp3
98 sub r3, r0, r4 @ r3 = tmp1
99
100 add r0, r2, r5, lsl #13 @ r0 = tmp10
101 rsb r2, r2, r5, lsl #13 @ r2 = tmp13
102 add r4, r6, r3, lsl #13 @ r4 = tmp11
103 rsb r3, r6, r3, lsl #13 @ r3 = tmp12
104
105 stmdb sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11
106
@ Odd part: r0-r10 are all reused, which is why the even-part results were spilled.
107 ldrsh r3, [lr, #10] @ r3 = 'd3'
108 ldrsh r5, [lr, #12] @ r5 = 'd5'
109 ldrsh r7, [lr, #14] @ r7 = 'd7'
110
111 add r0, r3, r5 @ r0 = 'z2'
112 add r2, r1, r7 @ r2 = 'z1'
113 add r4, r3, r7 @ r4 = 'z3'
114 add r6, r1, r5 @ r6 = 'z4'
115 ldr r9, [r11, #FIX_1_175875602_ID]
116 add r8, r4, r6 @ r8 = z3 + z4
117 ldr r10, [r11, #FIX_M_0_899976223_ID]
118 mul r8, r9, r8 @ r8 = 'z5'
119 ldr r9, [r11, #FIX_M_2_562915447_ID]
120 mul r2, r10, r2 @ r2 = 'z1'
121 ldr r10, [r11, #FIX_M_1_961570560_ID]
122 mul r0, r9, r0 @ r0 = 'z2'
123 ldr r9, [r11, #FIX_M_0_390180644_ID]
124 mla r4, r10, r4, r8 @ r4 = 'z3'
125 ldr r10, [r11, #FIX_0_298631336_ID]
126 mla r6, r9, r6, r8 @ r6 = 'z4'
127 ldr r9, [r11, #FIX_2_053119869_ID]
128 mla r7, r10, r7, r2 @ r7 = tmp0 + z1
129 ldr r10, [r11, #FIX_3_072711026_ID]
130 mla r5, r9, r5, r0 @ r5 = tmp1 + z2
131 ldr r9, [r11, #FIX_1_501321110_ID]
132 mla r3, r10, r3, r0 @ r3 = tmp2 + z2
133 add r7, r7, r4 @ r7 = tmp0
134 mla r1, r9, r1, r2 @ r1 = tmp3 + z1
135 add r5, r5, r6 @ r5 = tmp1
136 add r3, r3, r4 @ r3 = tmp2
137 add r1, r1, r6 @ r1 = tmp3
138
139 ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
140 @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
141
@ Every DESCALE below adds the rounding term 1<<10, then shifts right arithmetically by 11.
142 @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
143 add r8, r0, r1
144 add r8, r8, #(1<<10)
145 mov r8, r8, asr #11
146 strh r8, [lr, # 0]
147
148 @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
149 sub r8, r0, r1
150 add r8, r8, #(1<<10)
151 mov r8, r8, asr #11
152 strh r8, [lr, #14]
153
154 @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
155 add r8, r6, r3
156 add r8, r8, #(1<<10)
157 mov r8, r8, asr #11
158 strh r8, [lr, # 2]
159
160 @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
161 sub r8, r6, r3
162 add r8, r8, #(1<<10)
163 mov r8, r8, asr #11
164 strh r8, [lr, #12]
165
166 @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
167 add r8, r4, r5
168 add r8, r8, #(1<<10)
169 mov r8, r8, asr #11
170 strh r8, [lr, # 4]
171
172 @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
173 sub r8, r4, r5
174 add r8, r8, #(1<<10)
175 mov r8, r8, asr #11
176 strh r8, [lr, #10]
177
178 @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
179 add r8, r2, r7
180 add r8, r8, #(1<<10)
181 mov r8, r8, asr #11
182 strh r8, [lr, # 6]
183
184 @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
185 sub r8, r2, r7
186 add r8, r8, #(1<<10)
187 mov r8, r8, asr #11
188 strh r8, [lr, # 8]
189
190 @ End of row loop
191 add lr, lr, #16 @ advance to the next row
192 subs r12, r12, #1
193 bne row_loop
194 beq start_column_loop @ always taken here (the subs left Z set): skips the empty_row path
195
@ Reached from row_loop when only the first element of the row (r0 = 'd0') is
@ non-zero: every element of the row is set to the low 16 bits of d0 << 2.
196 empty_row:
197 ldr r1, [r11, #FIX_0xFFFF_ID] @ r1 = 0xFFFF mask
198 mov r0, r0, lsl #2
199 and r0, r0, r1 @ keep only the low halfword
200 add r0, r0, r0, lsl #16 @ copy it into the upper halfword as well
201 str r0, [lr, # 0] @ four word stores cover all eight 16-bit elements
202 str r0, [lr, # 4]
203 str r0, [lr, # 8]
204 str r0, [lr, #12]
205
@ Loop tail for empty_row; row_loop also branches straight here for an all-zero
@ row, which is therefore left untouched.
206 end_of_row_loop:
207 @ End of loop
208 add lr, lr, #16 @ advance to the next row
209 subs r12, r12, #1
210 bne row_loop @ when the counter reaches 0 execution falls through to start_column_loop
211
@ Column pass: lr is reloaded with the block pointer saved at [sp], then each of
@ the 8 iterations handles one column. Element k of a column is at byte offset
@ 16*k (written as (2*k)*8 below) and lr advances by 2 bytes per iteration.
@ Results are rounded, shifted right by 18 and stored back over the same column.
212 start_column_loop:
213 @ Start of column loop
214 ldr lr, [ sp ]
215 mov r12, #8 @ r12 = column counter
216 column_loop:
217 ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0'
218 ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2'
219 ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4'
220 ldrsh r6, [lr, #(12*8)] @ r6 = 'd6'
221
222 ldr r3, [r11, #FIX_0_541196100_ID]
223 add r1, r2, r6
224 ldr r5, [r11, #FIX_M_1_847759065_ID]
225 mul r1, r3, r1 @ r1 = z1
226 ldr r3, [r11, #FIX_0_765366865_ID]
227 mla r6, r5, r6, r1 @ r6 = tmp2
228 add r5, r0, r4 @ r5 = tmp0
229 mla r2, r3, r2, r1 @ r2 = tmp3
230 sub r3, r0, r4 @ r3 = tmp1
231
232 add r0, r2, r5, lsl #13 @ r0 = tmp10
233 rsb r2, r2, r5, lsl #13 @ r2 = tmp13
234 add r4, r6, r3, lsl #13 @ r4 = tmp11
235 rsb r6, r6, r3, lsl #13 @ r6 = tmp12
236
237 ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1'
238 ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3'
239 ldrsh r5, [lr, #(10*8)] @ r5 = 'd5'
240 ldrsh r7, [lr, #(14*8)] @ r7 = 'd7'
241
242 @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
243 orr r9, r1, r3
244 orr r10, r5, r7
245 orrs r10, r9, r10 @ Z set when d1, d3, d5 and d7 are all zero
246 beq empty_odd_column
247
248 stmdb sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp11, tmp12
249
250 add r0, r3, r5 @ r0 = 'z2'
251 add r2, r1, r7 @ r2 = 'z1'
252 add r4, r3, r7 @ r4 = 'z3'
253 add r6, r1, r5 @ r6 = 'z4'
254 ldr r9, [r11, #FIX_1_175875602_ID]
255 add r8, r4, r6 @ r8 = z3 + z4
256 ldr r10, [r11, #FIX_M_0_899976223_ID]
257 mul r8, r9, r8 @ r8 = 'z5'
258 ldr r9, [r11, #FIX_M_2_562915447_ID]
259 mul r2, r10, r2 @ r2 = 'z1'
260 ldr r10, [r11, #FIX_M_1_961570560_ID]
261 mul r0, r9, r0 @ r0 = 'z2'
262 ldr r9, [r11, #FIX_M_0_390180644_ID]
263 mla r4, r10, r4, r8 @ r4 = 'z3'
264 ldr r10, [r11, #FIX_0_298631336_ID]
265 mla r6, r9, r6, r8 @ r6 = 'z4'
266 ldr r9, [r11, #FIX_2_053119869_ID]
267 mla r7, r10, r7, r2 @ r7 = tmp0 + z1
268 ldr r10, [r11, #FIX_3_072711026_ID]
269 mla r5, r9, r5, r0 @ r5 = tmp1 + z2
270 ldr r9, [r11, #FIX_1_501321110_ID]
271 mla r3, r10, r3, r0 @ r3 = tmp2 + z2
272 add r7, r7, r4 @ r7 = tmp0
273 mla r1, r9, r1, r2 @ r1 = tmp3 + z1
274 add r5, r5, r6 @ r5 = tmp1
275 add r3, r3, r4 @ r3 = tmp2
276 add r1, r1, r6 @ r1 = tmp3
277
278 ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
279 @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
280
@ Every DESCALE below adds the rounding term 1<<17, then shifts right arithmetically by 18.
281 @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
282 add r8, r0, r1
283 add r8, r8, #(1<<17)
284 mov r8, r8, asr #18
285 strh r8, [lr, #( 0*8)]
286
287 @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
288 sub r8, r0, r1
289 add r8, r8, #(1<<17)
290 mov r8, r8, asr #18
291 strh r8, [lr, #(14*8)]
292
293 @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
294 add r8, r4, r3
295 add r8, r8, #(1<<17)
296 mov r8, r8, asr #18
297 strh r8, [lr, #( 2*8)]
298
299 @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
300 sub r8, r4, r3
301 add r8, r8, #(1<<17)
302 mov r8, r8, asr #18
303 strh r8, [lr, #(12*8)]
304
305 @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
306 add r8, r6, r5
307 add r8, r8, #(1<<17)
308 mov r8, r8, asr #18
309 strh r8, [lr, #( 4*8)]
310
311 @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
312 sub r8, r6, r5
313 add r8, r8, #(1<<17)
314 mov r8, r8, asr #18
315 strh r8, [lr, #(10*8)]
316
317 @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
318 add r8, r2, r7
319 add r8, r8, #(1<<17)
320 mov r8, r8, asr #18
321 strh r8, [lr, #( 6*8)]
322
323 @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
324 sub r8, r2, r7
325 add r8, r8, #(1<<17)
326 mov r8, r8, asr #18
327 strh r8, [lr, #( 8*8)]
328
329 @ End of column loop
330 add lr, lr, #2 @ advance to the next column
331 subs r12, r12, #1
332 bne column_loop
333 beq the_end @ always taken here (the subs left Z set): skips the empty_odd_column path
334
@ Taken from column_loop when d1, d3, d5 and d7 of the column are all zero.
@ The odd-part terms tmp0..tmp3 are then zero, so each even-part value
@ (r0 = tmp10, r4 = tmp11, r6 = tmp12, r2 = tmp13) is descaled once and the
@ same result is stored to both output positions that would otherwise differ
@ only by the sign of the odd term.
335 empty_odd_column:
336 @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
337 @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
338 add r0, r0, #(1<<17)
339 mov r0, r0, asr #18
340 strh r0, [lr, #( 0*8)]
341 strh r0, [lr, #(14*8)]
342
343 @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
344 @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
345 add r4, r4, #(1<<17)
346 mov r4, r4, asr #18
347 strh r4, [lr, #( 2*8)]
348 strh r4, [lr, #(12*8)]
349
350 @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
351 @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
352 add r6, r6, #(1<<17)
353 mov r6, r6, asr #18
354 strh r6, [lr, #( 4*8)]
355 strh r6, [lr, #(10*8)]
356
357 @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
358 @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
359 add r2, r2, #(1<<17)
360 mov r2, r2, asr #18
361 strh r2, [lr, #( 6*8)]
362 strh r2, [lr, #( 8*8)]
363
364 @ End of column loop
365 add lr, lr, #2 @ advance to the next column
366 subs r12, r12, #1
367 bne column_loop @ when the counter reaches 0 execution falls through to the_end
368
@ Common exit, reached from both column-pass loop tails.
369 the_end:
370 @ The end....
371 add sp, sp, #4 @ release the stack slot that held the DCT pointer
372 ldmia sp!, { r4 - r12, pc } @ restore callee saved regs and return
373
@ Constant table read through r11 by j_rev_dct_ARM. The entry order must stay in
@ step with the FIX_*_ID byte offsets: entry k (counting from 0) is at offset 4*k.
@ NOTE(review): the label is placed before .align, so any padding emitted by
@ .align would end up between the label and the first .word. Presumably none is
@ ever emitted because the code before it consists of 4-byte instructions - confirm.
374 const_array:
375 .align
376 .word FIX_0_298631336
377 .word FIX_0_541196100
378 .word FIX_0_765366865
379 .word FIX_1_175875602
380 .word FIX_1_501321110
381 .word FIX_2_053119869
382 .word FIX_3_072711026
383 .word FIX_M_0_390180644
384 .word FIX_M_0_899976223
385 .word FIX_M_1_847759065
386 .word FIX_M_1_961570560
387 .word FIX_M_2_562915447
388 .word FIX_0xFFFF