/*
   C-like prototype :
        void j_rev_dct_ARM(DCTBLOCK data)

   With DCTBLOCK being a pointer to an array of 64 'signed shorts'

   Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)

   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
   in the Software without restriction, including without limitation the rights
   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   copies of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be included in
   all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
   COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
   IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

*/

/*
   Multiplier constants.  The FIX_x_yyyyyyyyy names encode the real value
   x.yyyyyyyyy (FIX_M_* are the negated ones); the integers are those values
   scaled by 2^13 (e.g. 0.541196100 * 8192 = 4433), which matches the
   'lsl #13' applied to the un-multiplied terms in the code below.
*/
#define FIX_0_298631336 2446
#define FIX_0_541196100 4433
#define FIX_0_765366865 6270
#define FIX_1_175875602 9633
#define FIX_1_501321110 12299
#define FIX_2_053119869 16819
#define FIX_3_072711026 25172
#define FIX_M_0_390180644 -3196
#define FIX_M_0_899976223 -7373
#define FIX_M_1_847759065 -15137
#define FIX_M_1_961570560 -16069
#define FIX_M_2_562915447 -20995
#define FIX_0xFFFF 0xFFFF

/*
   Byte offsets of the constants above inside 'const_array' (one 32-bit word
   each).  The order here MUST match the order of the .word entries at the
   end of this file.
*/
#define FIX_0_298631336_ID   0
#define FIX_0_541196100_ID   4
#define FIX_0_765366865_ID   8
#define FIX_1_175875602_ID  12
#define FIX_1_501321110_ID  16
#define FIX_2_053119869_ID  20
#define FIX_3_072711026_ID  24
#define FIX_M_0_390180644_ID 28
#define FIX_M_0_899976223_ID 32
#define FIX_M_1_847759065_ID 36
#define FIX_M_1_961570560_ID 40
#define FIX_M_2_562915447_ID 44
#define FIX_0xFFFF_ID 48

        .text
        .align

@-----------------------------------------------------------------------
@ void j_rev_dct_ARM(DCTBLOCK data)
@
@ In-place two-pass 8x8 inverse DCT on 64 signed 16-bit coefficients.
@
@ In:    r0 = data.  The row pass reads it with 32-bit 'ldr', so the
@        block has to be 4-byte aligned.
@ Out:   nothing in registers; the 64 halfwords at 'data' are overwritten.
@ Saved: r4-r12 and lr are pushed on entry and restored on exit.
@ Stack: 40 bytes of saved registers + 4 bytes (copy of 'data') + a
@        transient 16 bytes while the odd part of a row/column is computed.
@
@ Data layout seen by the code:
@   row pass    : within each 16-byte row, byte offsets 0/2/4/6 are read as
@                 d0/d2/d4/d6 and offsets 8/10/12/14 as d1/d3/d5/d7; the
@                 eight results are written back at offsets 0,2,...,14 in
@                 natural order (out0..out7).
@   column pass : rows are 16 bytes apart; d0..d7 are read from rows 0..7
@                 and out0..out7 are written back to rows 0..7.
@
@ Register roles that hold for the whole function:
@   lr  = pointer to the current row / column
@   r12 = remaining row / column count
@   r11 = base address of const_array
@-----------------------------------------------------------------------
        .global j_rev_dct_ARM
j_rev_dct_ARM:
        stmdb   sp!, { r4 - r12, lr }   @ all callee saved regs

        sub     sp, sp, #4              @ reserve some space on the stack
        str     r0, [ sp ]              @ save the DCT pointer to the stack

        mov     lr, r0                  @ lr = pointer to the current row
        mov     r12, #8                 @ r12 = row-counter
        @ In ARM state 'pc' reads as the address of this instruction + 8,
        @ hence the '-8' in the offset.
        add     r11, pc, #(const_array-.-8) @ r11 = base pointer to the constants array

row_loop:
        ldrsh   r0, [lr, # 0]           @ r0 = 'd0'
        ldrsh   r2, [lr, # 2]           @ r2 = 'd2'

        @ Shortcuts for rows that are entirely zero, or zero except for
        @ 'd0'.  The row is tested one 32-bit word at a time.
        ldr     r5, [lr, # 0]           @ r5 = d0 and d2
        ldr     r6, [lr, # 4]           @ r6 = d4 and d6
        ldr     r3, [lr, # 8]           @ r3 = d1 and d3
        ldr     r4, [lr, #12]           @ r4 = d5 and d7
        orr     r3, r3, r4
        orr     r3, r3, r6              @ r3 = OR of every element but d0/d2
        orrs    r5, r3, r5
        beq     end_of_row_loop         @ nothing to be done as ALL of them are '0'
        @ 'd2' shares its word with 'd0', so it is not part of r3 and has
        @ to be tested on its own.  (Testing 'd1' here instead would be
        @ redundant with r3 and would wrongly treat rows whose only other
        @ non-zero element is 'd2' as DC-only.)
        orrs    r3, r3, r2
        beq     empty_row               @ only 'd0' is non-zero

        ldrsh   r1, [lr, # 8]           @ r1 = 'd1'
        ldrsh   r4, [lr, # 4]           @ r4 = 'd4'
        ldrsh   r6, [lr, # 6]           @ r6 = 'd6'

        @ Even part
        ldr     r3, [r11, #FIX_0_541196100_ID]
        add     r7, r2, r6
        ldr     r5, [r11, #FIX_M_1_847759065_ID]
        mul     r7, r3, r7              @ r7 = z1
        ldr     r3, [r11, #FIX_0_765366865_ID]
        mla     r6, r5, r6, r7          @ r6 = tmp2
        add     r5, r0, r4              @ r5 = tmp0
        mla     r2, r3, r2, r7          @ r2 = tmp3
        sub     r3, r0, r4              @ r3 = tmp1

        add     r0, r2, r5, lsl #13     @ r0 = tmp10
        rsb     r2, r2, r5, lsl #13     @ r2 = tmp13
        add     r4, r6, r3, lsl #13     @ r4 = tmp11
        rsb     r3, r6, r3, lsl #13     @ r3 = tmp12

        @ Registers are stored in ascending register order, so memory holds
        @ tmp10, tmp13, tmp12, tmp11 (lowest address first).
        stmdb   sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11

        @ Odd part
        ldrsh   r3, [lr, #10]           @ r3 = 'd3'
        ldrsh   r5, [lr, #12]           @ r5 = 'd5'
        ldrsh   r7, [lr, #14]           @ r7 = 'd7'

        add     r0, r3, r5              @ r0 = 'z2'
        add     r2, r1, r7              @ r2 = 'z1'
        add     r4, r3, r7              @ r4 = 'z3'
        add     r6, r1, r5              @ r6 = 'z4'
        ldr     r9, [r11, #FIX_1_175875602_ID]
        add     r8, r4, r6              @ r8 = z3 + z4
        ldr     r10, [r11, #FIX_M_0_899976223_ID]
        mul     r8, r9, r8              @ r8 = 'z5'
        ldr     r9, [r11, #FIX_M_2_562915447_ID]
        mul     r2, r10, r2             @ r2 = 'z1'
        ldr     r10, [r11, #FIX_M_1_961570560_ID]
        mul     r0, r9, r0              @ r0 = 'z2'
        ldr     r9, [r11, #FIX_M_0_390180644_ID]
        mla     r4, r10, r4, r8         @ r4 = 'z3'
        ldr     r10, [r11, #FIX_0_298631336_ID]
        mla     r6, r9, r6, r8          @ r6 = 'z4'
        ldr     r9, [r11, #FIX_2_053119869_ID]
        mla     r7, r10, r7, r2         @ r7 = tmp0 + z1
        ldr     r10, [r11, #FIX_3_072711026_ID]
        mla     r5, r9, r5, r0          @ r5 = tmp1 + z2
        ldr     r9, [r11, #FIX_1_501321110_ID]
        mla     r3, r10, r3, r0         @ r3 = tmp2 + z2
        add     r7, r7, r4              @ r7 = tmp0
        mla     r1, r9, r1, r2          @ r1 = tmp3 + z1
        add     r5, r5, r6              @ r5 = tmp1
        add     r3, r3, r4              @ r3 = tmp2
        add     r1, r1, r6              @ r1 = tmp3

        ldmia   sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
                                        @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0

        @ Each result below is rounded (+ 1<<10) and shifted right by 11.

        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
        add     r8, r0, r1
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 0]

        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
        sub     r8, r0, r1
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, #14]

        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
        add     r8, r6, r3
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 2]

        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
        sub     r8, r6, r3
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, #12]

        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
        add     r8, r4, r5
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 4]

        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
        sub     r8, r4, r5
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, #10]

        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
        add     r8, r2, r7
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 6]

        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
        sub     r8, r2, r7
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 8]

        @ End of row loop
        add     lr, lr, #16
        subs    r12, r12, #1
        bne     row_loop
        b       start_column_loop

empty_row:
        @ Only 'd0' is non-zero: every output of the row is d0 << 2.
        @ Build a word holding that 16-bit value twice and store it 4 times.
        ldr     r1, [r11, #FIX_0xFFFF_ID]
        mov     r0, r0, lsl #2
        and     r0, r0, r1              @ keep the low 16 bits only
        add     r0, r0, r0, lsl #16     @ duplicate into the high halfword
        str     r0, [lr, # 0]
        str     r0, [lr, # 4]
        str     r0, [lr, # 8]
        str     r0, [lr, #12]

end_of_row_loop:
        @ End of loop
        add     lr, lr, #16
        subs    r12, r12, #1
        bne     row_loop

start_column_loop:
        @ Start of column loop
        ldr     lr, [ sp ]              @ lr = saved DCT pointer (first column)
        mov     r12, #8                 @ r12 = column-counter
column_loop:
        ldrsh   r0, [lr, #( 0*8)]       @ r0 = 'd0'
        ldrsh   r2, [lr, #( 4*8)]       @ r2 = 'd2'
        ldrsh   r4, [lr, #( 8*8)]       @ r4 = 'd4'
        ldrsh   r6, [lr, #(12*8)]       @ r6 = 'd6'

        @ Even part
        ldr     r3, [r11, #FIX_0_541196100_ID]
        add     r1, r2, r6
        ldr     r5, [r11, #FIX_M_1_847759065_ID]
        mul     r1, r3, r1              @ r1 = z1
        ldr     r3, [r11, #FIX_0_765366865_ID]
        mla     r6, r5, r6, r1          @ r6 = tmp2
        add     r5, r0, r4              @ r5 = tmp0
        mla     r2, r3, r2, r1          @ r2 = tmp3
        sub     r3, r0, r4              @ r3 = tmp1

        add     r0, r2, r5, lsl #13     @ r0 = tmp10
        rsb     r2, r2, r5, lsl #13     @ r2 = tmp13
        add     r4, r6, r3, lsl #13     @ r4 = tmp11
        rsb     r6, r6, r3, lsl #13     @ r6 = tmp12

        ldrsh   r1, [lr, #( 2*8)]       @ r1 = 'd1'
        ldrsh   r3, [lr, #( 6*8)]       @ r3 = 'd3'
        ldrsh   r5, [lr, #(10*8)]       @ r5 = 'd5'
        ldrsh   r7, [lr, #(14*8)]       @ r7 = 'd7'

        @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
        orr     r9, r1, r3
        orr     r10, r5, r7
        orrs    r10, r9, r10
        beq     empty_odd_column

        stmdb   sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp11, tmp12

        @ Odd part
        add     r0, r3, r5              @ r0 = 'z2'
        add     r2, r1, r7              @ r2 = 'z1'
        add     r4, r3, r7              @ r4 = 'z3'
        add     r6, r1, r5              @ r6 = 'z4'
        ldr     r9, [r11, #FIX_1_175875602_ID]
        add     r8, r4, r6              @ r8 = z3 + z4
        ldr     r10, [r11, #FIX_M_0_899976223_ID]
        mul     r8, r9, r8              @ r8 = 'z5'
        ldr     r9, [r11, #FIX_M_2_562915447_ID]
        mul     r2, r10, r2             @ r2 = 'z1'
        ldr     r10, [r11, #FIX_M_1_961570560_ID]
        mul     r0, r9, r0              @ r0 = 'z2'
        ldr     r9, [r11, #FIX_M_0_390180644_ID]
        mla     r4, r10, r4, r8         @ r4 = 'z3'
        ldr     r10, [r11, #FIX_0_298631336_ID]
        mla     r6, r9, r6, r8          @ r6 = 'z4'
        ldr     r9, [r11, #FIX_2_053119869_ID]
        mla     r7, r10, r7, r2         @ r7 = tmp0 + z1
        ldr     r10, [r11, #FIX_3_072711026_ID]
        mla     r5, r9, r5, r0          @ r5 = tmp1 + z2
        ldr     r9, [r11, #FIX_1_501321110_ID]
        mla     r3, r10, r3, r0         @ r3 = tmp2 + z2
        add     r7, r7, r4              @ r7 = tmp0
        mla     r1, r9, r1, r2          @ r1 = tmp3 + z1
        add     r5, r5, r6              @ r5 = tmp1
        add     r3, r3, r4              @ r3 = tmp2
        add     r1, r1, r6              @ r1 = tmp3

        ldmia   sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
                                        @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0

        @ Each result below is rounded (+ 1<<17) and shifted right by 18.

        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
        add     r8, r0, r1
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 0*8)]

        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
        sub     r8, r0, r1
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #(14*8)]

        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
        add     r8, r4, r3
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 2*8)]

        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
        sub     r8, r4, r3
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #(12*8)]

        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
        add     r8, r6, r5
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 4*8)]

        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
        sub     r8, r6, r5
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #(10*8)]

        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
        add     r8, r2, r7
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 6*8)]

        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
        sub     r8, r2, r7
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 8*8)]

        @ End of column loop
        add     lr, lr, #2
        subs    r12, r12, #1
        bne     column_loop
        b       the_end

empty_odd_column:
        @ d1 = d3 = d5 = d7 = 0, so tmp0..tmp3 are all zero and each pair of
        @ mirrored outputs gets the same value.

        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
        add     r0, r0, #(1<<17)
        mov     r0, r0, asr #18
        strh    r0, [lr, #( 0*8)]
        strh    r0, [lr, #(14*8)]

        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
        add     r4, r4, #(1<<17)
        mov     r4, r4, asr #18
        strh    r4, [lr, #( 2*8)]
        strh    r4, [lr, #(12*8)]

        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
        add     r6, r6, #(1<<17)
        mov     r6, r6, asr #18
        strh    r6, [lr, #( 4*8)]
        strh    r6, [lr, #(10*8)]

        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
        add     r2, r2, #(1<<17)
        mov     r2, r2, asr #18
        strh    r2, [lr, #( 6*8)]
        strh    r2, [lr, #( 8*8)]

        @ End of column loop
        add     lr, lr, #2
        subs    r12, r12, #1
        bne     column_loop

the_end:
        @ The end....
        add     sp, sp, #4              @ drop the saved DCT pointer
        ldmia   sp!, { r4 - r12, pc }   @ restore callee saved regs and return

        @ Constant table, addressed through r11 with the FIX_*_ID offsets.
        @ Kept right after the code so the 'add r11, pc, #imm' at function
        @ entry can reach it.
        .align
const_array:
        .word FIX_0_298631336
        .word FIX_0_541196100
        .word FIX_0_765366865
        .word FIX_1_175875602
        .word FIX_1_501321110
        .word FIX_2_053119869
        .word FIX_3_072711026
        .word FIX_M_0_390180644
        .word FIX_M_0_899976223
        .word FIX_M_1_847759065
        .word FIX_M_1_961570560
        .word FIX_M_2_562915447
        .word FIX_0xFFFF