comparison armv4l/jrevdct_arm.S @ 61:fefaa96def6e libavcodec

arm specific code
author glantau
date Mon, 13 Aug 2001 21:38:25 +0000
parents
children ef2149182f1c
comparison
equal deleted inserted replaced
60:35c1141e23d9 61:fefaa96def6e
1 /*
2 C-like prototype :
3 void j_rev_dct_ARM(DCTBLOCK data)
4
5 With DCTBLOCK being a pointer to an array of 64 'signed shorts'
6
7 Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
23 IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
24 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 */
/*
 * Fixed-point multipliers.  Each FIX_<x> is the real value x scaled by
 * 2^13 and rounded to the nearest integer; FIX_M_<x> is the negated
 * value.  FIX_0xFFFF is a plain 16-bit mask.
 */
#define FIX_0_298631336         2446
#define FIX_0_541196100         4433
#define FIX_0_765366865         6270
#define FIX_1_175875602         9633
#define FIX_1_501321110         12299
#define FIX_2_053119869         16819
#define FIX_3_072711026         25172
#define FIX_M_0_390180644       -3196
#define FIX_M_0_899976223       -7373
#define FIX_M_1_847759065       -15137
#define FIX_M_1_961570560       -16069
#define FIX_M_2_562915447       -20995
#define FIX_0xFFFF              0xFFFF

/*
 * Byte offset of each constant inside const_array: one 32-bit word per
 * entry, listed in the same order as the values above.
 */
#define FIX_0_298631336_ID      0
#define FIX_0_541196100_ID      4
#define FIX_0_765366865_ID      8
#define FIX_1_175875602_ID      12
#define FIX_1_501321110_ID      16
#define FIX_2_053119869_ID      20
#define FIX_3_072711026_ID      24
#define FIX_M_0_390180644_ID    28
#define FIX_M_0_899976223_ID    32
#define FIX_M_1_847759065_ID    36
#define FIX_M_1_961570560_ID    40
#define FIX_M_2_562915447_ID    44
#define FIX_0xFFFF_ID           48
54 .text
55 .align
56
57 .global j_rev_dct_ARM
58 j_rev_dct_ARM:
59 stmdb sp!, { r4 - r12, lr } @ all callee saved regs
60
61 sub sp, sp, #4 @ reserve some space on the stack
62 str r0, [ sp ] @ save the DCT pointer to the stack
63
64 mov lr, r0 @ lr = pointer to the current row
65 mov r12, #8 @ r12 = row-counter
66 add r11, pc, #(const_array-.-8) @ r11 = base pointer to the constants array
67 row_loop:
68 ldrsh r0, [lr, # 0] @ r0 = 'd0'
69 ldrsh r1, [lr, # 8] @ r1 = 'd1'
70
71 @ Optimization for row that have all items except the first set to 0
72 @ (this works as the DCTELEMS are always 4-byte aligned)
73 ldr r5, [lr, # 0]
74 ldr r2, [lr, # 4]
75 ldr r3, [lr, # 8]
76 ldr r4, [lr, #12]
77 orr r3, r3, r4
78 orr r3, r3, r2
79 orrs r5, r3, r5
80 beq end_of_row_loop @ nothing to be done as ALL of them are '0'
81 orrs r2, r3, r1
82 beq empty_row
83
84 ldrsh r2, [lr, # 2] @ r2 = 'd2'
85 ldrsh r4, [lr, # 4] @ r4 = 'd4'
86 ldrsh r6, [lr, # 6] @ r6 = 'd6'
87
88 ldr r3, [r11, #FIX_0_541196100_ID]
89 add r7, r2, r6
90 ldr r5, [r11, #FIX_M_1_847759065_ID]
91 mul r7, r3, r7 @ r7 = z1
92 ldr r3, [r11, #FIX_0_765366865_ID]
93 mla r6, r5, r6, r7 @ r6 = tmp2
94 add r5, r0, r4 @ r5 = tmp0
95 mla r2, r3, r2, r7 @ r2 = tmp3
96 sub r3, r0, r4 @ r3 = tmp1
97
98 add r0, r2, r5, lsl #13 @ r0 = tmp10
99 rsb r2, r2, r5, lsl #13 @ r2 = tmp13
100 add r4, r6, r3, lsl #13 @ r4 = tmp11
101 rsb r3, r6, r3, lsl #13 @ r3 = tmp12
102
103 stmdb sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11
104
105 ldrsh r3, [lr, #10] @ r3 = 'd3'
106 ldrsh r5, [lr, #12] @ r5 = 'd5'
107 ldrsh r7, [lr, #14] @ r7 = 'd7'
108
109 add r0, r3, r5 @ r0 = 'z2'
110 add r2, r1, r7 @ r2 = 'z1'
111 add r4, r3, r7 @ r4 = 'z3'
112 add r6, r1, r5 @ r6 = 'z4'
113 ldr r9, [r11, #FIX_1_175875602_ID]
114 add r8, r4, r6 @ r8 = z3 + z4
115 ldr r10, [r11, #FIX_M_0_899976223_ID]
116 mul r8, r9, r8 @ r8 = 'z5'
117 ldr r9, [r11, #FIX_M_2_562915447_ID]
118 mul r2, r10, r2 @ r2 = 'z1'
119 ldr r10, [r11, #FIX_M_1_961570560_ID]
120 mul r0, r9, r0 @ r0 = 'z2'
121 ldr r9, [r11, #FIX_M_0_390180644_ID]
122 mla r4, r10, r4, r8 @ r4 = 'z3'
123 ldr r10, [r11, #FIX_0_298631336_ID]
124 mla r6, r9, r6, r8 @ r6 = 'z4'
125 ldr r9, [r11, #FIX_2_053119869_ID]
126 mla r7, r10, r7, r2 @ r7 = tmp0 + z1
127 ldr r10, [r11, #FIX_3_072711026_ID]
128 mla r5, r9, r5, r0 @ r5 = tmp1 + z2
129 ldr r9, [r11, #FIX_1_501321110_ID]
130 mla r3, r10, r3, r0 @ r3 = tmp2 + z2
131 add r7, r7, r4 @ r7 = tmp0
132 mla r1, r9, r1, r2 @ r1 = tmp3 + z1
133 add r5, r5, r6 @ r5 = tmp1
134 add r3, r3, r4 @ r3 = tmp2
135 add r1, r1, r6 @ r1 = tmp3
136
137 ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
138 @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
139
140 @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
141 add r8, r0, r1
142 add r8, r8, #(1<<10)
143 mov r8, r8, asr #11
144 strh r8, [lr, # 0]
145
146 @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
147 sub r8, r0, r1
148 add r8, r8, #(1<<10)
149 mov r8, r8, asr #11
150 strh r8, [lr, #14]
151
152 @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
153 add r8, r6, r3
154 add r8, r8, #(1<<10)
155 mov r8, r8, asr #11
156 strh r8, [lr, # 2]
157
158 @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
159 sub r8, r6, r3
160 add r8, r8, #(1<<10)
161 mov r8, r8, asr #11
162 strh r8, [lr, #12]
163
164 @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
165 add r8, r4, r5
166 add r8, r8, #(1<<10)
167 mov r8, r8, asr #11
168 strh r8, [lr, # 4]
169
170 @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
171 sub r8, r4, r5
172 add r8, r8, #(1<<10)
173 mov r8, r8, asr #11
174 strh r8, [lr, #10]
175
176 @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
177 add r8, r2, r7
178 add r8, r8, #(1<<10)
179 mov r8, r8, asr #11
180 strh r8, [lr, # 6]
181
182 @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
183 sub r8, r2, r7
184 add r8, r8, #(1<<10)
185 mov r8, r8, asr #11
186 strh r8, [lr, # 8]
187
188 @ End of row loop
189 add lr, lr, #16
190 subs r12, r12, #1
191 bne row_loop
192 beq start_column_loop
193
194 empty_row:
195 ldr r1, [r11, #FIX_0xFFFF_ID]
196 mov r0, r0, lsl #2
197 and r0, r0, r1
198 add r0, r0, r0, lsl #16
199 str r0, [lr, # 0]
200 str r0, [lr, # 4]
201 str r0, [lr, # 8]
202 str r0, [lr, #12]
203
204 end_of_row_loop:
205 @ End of loop
206 add lr, lr, #16
207 subs r12, r12, #1
208 bne row_loop
209
210 start_column_loop:
211 @ Start of column loop
212 ldr lr, [ sp ]
213 mov r12, #8
214 column_loop:
215 ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0'
216 ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2'
217 ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4'
218 ldrsh r6, [lr, #(12*8)] @ r6 = 'd6'
219
220 ldr r3, [r11, #FIX_0_541196100_ID]
221 add r1, r2, r6
222 ldr r5, [r11, #FIX_M_1_847759065_ID]
223 mul r1, r3, r1 @ r1 = z1
224 ldr r3, [r11, #FIX_0_765366865_ID]
225 mla r6, r5, r6, r1 @ r6 = tmp2
226 add r5, r0, r4 @ r5 = tmp0
227 mla r2, r3, r2, r1 @ r2 = tmp3
228 sub r3, r0, r4 @ r3 = tmp1
229
230 add r0, r2, r5, lsl #13 @ r0 = tmp10
231 rsb r2, r2, r5, lsl #13 @ r2 = tmp13
232 add r4, r6, r3, lsl #13 @ r4 = tmp11
233 rsb r6, r6, r3, lsl #13 @ r6 = tmp12
234
235 ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1'
236 ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3'
237 ldrsh r5, [lr, #(10*8)] @ r5 = 'd5'
238 ldrsh r7, [lr, #(14*8)] @ r7 = 'd7'
239
240 @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
241 orr r9, r1, r3
242 orr r10, r5, r7
243 orrs r10, r9, r10
244 beq empty_odd_column
245
246 stmdb sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp12, tmp11
247
248 add r0, r3, r5 @ r0 = 'z2'
249 add r2, r1, r7 @ r2 = 'z1'
250 add r4, r3, r7 @ r4 = 'z3'
251 add r6, r1, r5 @ r6 = 'z4'
252 ldr r9, [r11, #FIX_1_175875602_ID]
253 add r8, r4, r6
254 ldr r10, [r11, #FIX_M_0_899976223_ID]
255 mul r8, r9, r8 @ r8 = 'z5'
256 ldr r9, [r11, #FIX_M_2_562915447_ID]
257 mul r2, r10, r2 @ r2 = 'z1'
258 ldr r10, [r11, #FIX_M_1_961570560_ID]
259 mul r0, r9, r0 @ r0 = 'z2'
260 ldr r9, [r11, #FIX_M_0_390180644_ID]
261 mla r4, r10, r4, r8 @ r4 = 'z3'
262 ldr r10, [r11, #FIX_0_298631336_ID]
263 mla r6, r9, r6, r8 @ r6 = 'z4'
264 ldr r9, [r11, #FIX_2_053119869_ID]
265 mla r7, r10, r7, r2 @ r7 = tmp0 + z1
266 ldr r10, [r11, #FIX_3_072711026_ID]
267 mla r5, r9, r5, r0 @ r5 = tmp1 + z2
268 ldr r9, [r11, #FIX_1_501321110_ID]
269 mla r3, r10, r3, r0 @ r3 = tmp2 + z2
270 add r7, r7, r4 @ r7 = tmp0
271 mla r1, r9, r1, r2 @ r1 = tmp3 + z1
272 add r5, r5, r6 @ r5 = tmp1
273 add r3, r3, r4 @ r3 = tmp2
274 add r1, r1, r6 @ r1 = tmp3
275
276 ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
277 @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
278
279 @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
280 add r8, r0, r1
281 add r8, r8, #(1<<17)
282 mov r8, r8, asr #18
283 strh r8, [lr, #( 0*8)]
284
285 @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
286 sub r8, r0, r1
287 add r8, r8, #(1<<17)
288 mov r8, r8, asr #18
289 strh r8, [lr, #(14*8)]
290
291 @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
292 add r8, r4, r3
293 add r8, r8, #(1<<17)
294 mov r8, r8, asr #18
295 strh r8, [lr, #( 2*8)]
296
297 @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
298 sub r8, r4, r3
299 add r8, r8, #(1<<17)
300 mov r8, r8, asr #18
301 strh r8, [lr, #(12*8)]
302
303 @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
304 add r8, r6, r5
305 add r8, r8, #(1<<17)
306 mov r8, r8, asr #18
307 strh r8, [lr, #( 4*8)]
308
309 @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
310 sub r8, r6, r5
311 add r8, r8, #(1<<17)
312 mov r8, r8, asr #18
313 strh r8, [lr, #(10*8)]
314
315 @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
316 add r8, r2, r7
317 add r8, r8, #(1<<17)
318 mov r8, r8, asr #18
319 strh r8, [lr, #( 6*8)]
320
321 @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
322 sub r8, r2, r7
323 add r8, r8, #(1<<17)
324 mov r8, r8, asr #18
325 strh r8, [lr, #( 8*8)]
326
327 @ End of row loop
328 add lr, lr, #2
329 subs r12, r12, #1
330 bne column_loop
331 beq the_end
332
333 empty_odd_column:
334 @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
335 @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
336 add r0, r0, #(1<<17)
337 mov r0, r0, asr #18
338 strh r0, [lr, #( 0*8)]
339 strh r0, [lr, #(14*8)]
340
341 @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
342 @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
343 add r4, r4, #(1<<17)
344 mov r4, r4, asr #18
345 strh r4, [lr, #( 2*8)]
346 strh r4, [lr, #(12*8)]
347
348 @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
349 @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
350 add r6, r6, #(1<<17)
351 mov r6, r6, asr #18
352 strh r6, [lr, #( 4*8)]
353 strh r6, [lr, #(10*8)]
354
355 @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
356 @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
357 add r2, r2, #(1<<17)
358 mov r2, r2, asr #18
359 strh r2, [lr, #( 6*8)]
360 strh r2, [lr, #( 8*8)]
361
362 @ End of row loop
363 add lr, lr, #2
364 subs r12, r12, #1
365 bne column_loop
366
367 the_end:
368 @ The end....
369 add sp, sp, #4
370 ldmia sp!, { r4 - r12, pc } @ restore callee saved regs and return
371
@ Constants table read through r11 in j_rev_dct_ARM.  One 32-bit word
@ per entry; the order must match the FIX_*_ID byte offsets.
@ The alignment directive comes before the label so that the label
@ always names the first word, even if padding were ever inserted.
        .align
const_array:
        .word   FIX_0_298631336
        .word   FIX_0_541196100
        .word   FIX_0_765366865
        .word   FIX_1_175875602
        .word   FIX_1_501321110
        .word   FIX_2_053119869
        .word   FIX_3_072711026
        .word   FIX_M_0_390180644
        .word   FIX_M_0_899976223
        .word   FIX_M_1_847759065
        .word   FIX_M_1_961570560
        .word   FIX_M_2_562915447
        .word   FIX_0xFFFF