2967
|
1 /*
|
61
|
2 C-like prototype :
|
10374
|
        void ff_j_rev_dct_arm(DCTBLOCK data)
|
61
|
4
|
|
5 With DCTBLOCK being a pointer to an array of 64 'signed shorts'
|
|
6
|
|
7 Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
|
|
8
|
|
9 Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
10 of this software and associated documentation files (the "Software"), to deal
|
|
11 in the Software without restriction, including without limitation the rights
|
|
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
13 copies of the Software, and to permit persons to whom the Software is
|
|
14 furnished to do so, subject to the following conditions:
|
|
15
|
|
16 The above copyright notice and this permission notice shall be included in
|
|
17 all copies or substantial portions of the Software.
|
|
18
|
|
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
22 COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
|
23 IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
24 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
2967
|
25
|
61
|
26 */
|
8069
|
27
|
|
28 #include "asm.S"
|
|
29
|
61
|
30 #define FIX_0_298631336 2446
|
|
31 #define FIX_0_541196100 4433
|
|
32 #define FIX_0_765366865 6270
|
|
33 #define FIX_1_175875602 9633
|
|
34 #define FIX_1_501321110 12299
|
|
35 #define FIX_2_053119869 16819
|
|
36 #define FIX_3_072711026 25172
|
|
37 #define FIX_M_0_390180644 -3196
|
|
38 #define FIX_M_0_899976223 -7373
|
|
39 #define FIX_M_1_847759065 -15137
|
|
40 #define FIX_M_1_961570560 -16069
|
|
41 #define FIX_M_2_562915447 -20995
|
2967
|
42 #define FIX_0xFFFF 0xFFFF
|
|
43
|
61
|
44 #define FIX_0_298631336_ID 0
|
|
45 #define FIX_0_541196100_ID 4
|
|
46 #define FIX_0_765366865_ID 8
|
|
47 #define FIX_1_175875602_ID 12
|
|
48 #define FIX_1_501321110_ID 16
|
|
49 #define FIX_2_053119869_ID 20
|
|
50 #define FIX_3_072711026_ID 24
|
|
51 #define FIX_M_0_390180644_ID 28
|
|
52 #define FIX_M_0_899976223_ID 32
|
|
53 #define FIX_M_1_847759065_ID 36
|
|
54 #define FIX_M_1_961570560_ID 40
|
|
55 #define FIX_M_2_562915447_ID 44
|
|
56 #define FIX_0xFFFF_ID 48
|
2979
|
/*
 * ff_j_rev_dct_arm -- in-place 8x8 inverse DCT.
 *
 * In:     r0 = pointer to the block: 64 signed 16-bit values, stored as
 *              8 rows of 16 bytes.  The block is read and overwritten.
 * Out:    nothing in registers; the block holds the result.
 * Saved:  r4-r12 and lr are pushed on entry and restored on exit (the saved
 *         lr is popped straight into pc to return).  r0-r3 are clobbered.
 * Stack:  40 bytes of saved registers + 4 bytes holding the block pointer,
 *         plus a transient 16-byte spill of tmp10..tmp13 per row / column.
 *
 * Register roles shared by both passes:
 *   lr  = address of the row (pass 1) or column (pass 2) being processed
 *   r12 = number of rows / columns still to do (counts 8 down to 0)
 *   r11 = base of const_array; constants are fetched by their *_ID offsets
 *   r8  = scratch for the rounded, shifted result that is about to be stored
 *
 * Pass 1 walks the rows (lr += 16).  Within a row the halfwords at byte
 * offsets 0, 2, 4, 6 are read as d0, d2, d4, d6 and those at 8, 10, 12, 14 as
 * d1, d3, d5, d7; results are rounded, shifted right by 11 and written to
 * offsets 0..14.  Two shortcuts exist: a row that is entirely zero is left
 * untouched, and a row where only d0 is non-zero is filled with d0 << 2.
 *
 * Pass 2 walks the columns (lr += 2).  Column elements are 16 bytes apart
 * (offsets are written as (2*k)*8); here d0..d7 are rows 0..7 in natural
 * order.  Results are rounded and shifted right by 18.  When d1, d3, d5 and
 * d7 are all zero the odd half of the butterfly is skipped.
 *
 * NOTE(review): every mul/mla below keeps its destination register different
 * from its first source operand, which older ARM cores are understood to
 * require -- preserve that when editing and confirm against the ARM ARM.
 */
        .text
        .align

function ff_j_rev_dct_arm, export=1
        stmdb   sp!, { r4 - r12, lr }   @ all callee saved regs

        sub     sp, sp, #4              @ reserve some space on the stack
        str     r0, [ sp ]              @ save the DCT pointer to the stack

        mov     lr, r0                  @ lr = pointer to the current row
        mov     r12, #8                 @ r12 = row-counter
        adr     r11, const_array        @ r11 = base pointer to the constants array
row_loop:
        ldrsh   r0, [lr, # 0]           @ r0 = 'd0'
        ldrsh   r2, [lr, # 2]           @ r2 = 'd2'

        @ Optimization for row that have all items except the first set to 0
        @ (this works as the DCTELEMS are always 4-byte aligned)
        ldr     r5, [lr, # 0]
        ldr     r6, [lr, # 4]
        ldr     r3, [lr, # 8]
        ldr     r4, [lr, #12]
        orr     r3, r3, r4
        orr     r3, r3, r6              @ r3 = OR of the last three words of the row
        orrs    r5, r3, r5
        beq     end_of_row_loop         @ nothing to be done as ALL of them are '0'
        orrs    r3, r3, r2
        beq     empty_row               @ everything but d0 is zero

        ldrsh   r1, [lr, # 8]           @ r1 = 'd1'
        ldrsh   r4, [lr, # 4]           @ r4 = 'd4'
        ldrsh   r6, [lr, # 6]           @ r6 = 'd6'

        @ Even part
        ldr     r3, [r11, #FIX_0_541196100_ID]
        add     r7, r2, r6
        ldr     r5, [r11, #FIX_M_1_847759065_ID]
        mul     r7, r3, r7              @ r7 = z1
        ldr     r3, [r11, #FIX_0_765366865_ID]
        mla     r6, r5, r6, r7          @ r6 = tmp2
        add     r5, r0, r4              @ r5 = tmp0
        mla     r2, r3, r2, r7          @ r2 = tmp3
        sub     r3, r0, r4              @ r3 = tmp1

        add     r0, r2, r5, lsl #13     @ r0 = tmp10
        rsb     r2, r2, r5, lsl #13     @ r2 = tmp13
        add     r4, r6, r3, lsl #13     @ r4 = tmp11
        rsb     r3, r6, r3, lsl #13     @ r3 = tmp12

        stmdb   sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11

        @ Odd part
        ldrsh   r3, [lr, #10]           @ r3 = 'd3'
        ldrsh   r5, [lr, #12]           @ r5 = 'd5'
        ldrsh   r7, [lr, #14]           @ r7 = 'd7'

        add     r0, r3, r5              @ r0 = 'z2'
        add     r2, r1, r7              @ r2 = 'z1'
        add     r4, r3, r7              @ r4 = 'z3'
        add     r6, r1, r5              @ r6 = 'z4'
        ldr     r9, [r11, #FIX_1_175875602_ID]
        add     r8, r4, r6              @ r8 = z3 + z4
        ldr     r10, [r11, #FIX_M_0_899976223_ID]
        mul     r8, r9, r8              @ r8 = 'z5'
        ldr     r9, [r11, #FIX_M_2_562915447_ID]
        mul     r2, r10, r2             @ r2 = 'z1'
        ldr     r10, [r11, #FIX_M_1_961570560_ID]
        mul     r0, r9, r0              @ r0 = 'z2'
        ldr     r9, [r11, #FIX_M_0_390180644_ID]
        mla     r4, r10, r4, r8         @ r4 = 'z3'
        ldr     r10, [r11, #FIX_0_298631336_ID]
        mla     r6, r9, r6, r8          @ r6 = 'z4'
        ldr     r9, [r11, #FIX_2_053119869_ID]
        mla     r7, r10, r7, r2         @ r7 = tmp0 + z1
        ldr     r10, [r11, #FIX_3_072711026_ID]
        mla     r5, r9, r5, r0          @ r5 = tmp1 + z2
        ldr     r9, [r11, #FIX_1_501321110_ID]
        mla     r3, r10, r3, r0         @ r3 = tmp2 + z2
        add     r7, r7, r4              @ r7 = tmp0
        mla     r1, r9, r1, r2          @ r1 = tmp3 + z1
        add     r5, r5, r6              @ r5 = tmp1
        add     r3, r3, r4              @ r3 = tmp2
        add     r1, r1, r6              @ r1 = tmp3

        ldmia   sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
                                        @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0

        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
        add     r8, r0, r1
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 0]

        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
        sub     r8, r0, r1
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, #14]

        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
        add     r8, r6, r3
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 2]

        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
        sub     r8, r6, r3
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, #12]

        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
        add     r8, r4, r5
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 4]

        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
        sub     r8, r4, r5
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, #10]

        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
        add     r8, r2, r7
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 6]

        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
        sub     r8, r2, r7
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, # 8]

        @ End of row loop
        add     lr, lr, #16
        subs    r12, r12, #1
        bne     row_loop
        b       start_column_loop       @ all rows done; skip the shortcut paths

empty_row:
        @ Only d0 is non-zero: each output of the row is d0 << 2, kept to
        @ 16 bits and replicated into both halves of a word for the stores.
        ldr     r1, [r11, #FIX_0xFFFF_ID]
        mov     r0, r0, lsl #2
        and     r0, r0, r1
        add     r0, r0, r0, lsl #16
        str     r0, [lr, # 0]
        str     r0, [lr, # 4]
        str     r0, [lr, # 8]
        str     r0, [lr, #12]

end_of_row_loop:
        @ End of loop (shared by the all-zero and the d0-only shortcuts)
        add     lr, lr, #16
        subs    r12, r12, #1
        bne     row_loop

start_column_loop:
        @ Start of column loop
        ldr     lr, [ sp ]              @ lr = block pointer saved on entry
        mov     r12, #8                 @ r12 = column-counter
column_loop:
        ldrsh   r0, [lr, #( 0*8)]       @ r0 = 'd0'
        ldrsh   r2, [lr, #( 4*8)]       @ r2 = 'd2'
        ldrsh   r4, [lr, #( 8*8)]       @ r4 = 'd4'
        ldrsh   r6, [lr, #(12*8)]       @ r6 = 'd6'

        @ Even part
        ldr     r3, [r11, #FIX_0_541196100_ID]
        add     r1, r2, r6
        ldr     r5, [r11, #FIX_M_1_847759065_ID]
        mul     r1, r3, r1              @ r1 = z1
        ldr     r3, [r11, #FIX_0_765366865_ID]
        mla     r6, r5, r6, r1          @ r6 = tmp2
        add     r5, r0, r4              @ r5 = tmp0
        mla     r2, r3, r2, r1          @ r2 = tmp3
        sub     r3, r0, r4              @ r3 = tmp1

        add     r0, r2, r5, lsl #13     @ r0 = tmp10
        rsb     r2, r2, r5, lsl #13     @ r2 = tmp13
        add     r4, r6, r3, lsl #13     @ r4 = tmp11
        rsb     r6, r6, r3, lsl #13     @ r6 = tmp12

        ldrsh   r1, [lr, #( 2*8)]       @ r1 = 'd1'
        ldrsh   r3, [lr, #( 6*8)]       @ r3 = 'd3'
        ldrsh   r5, [lr, #(10*8)]       @ r5 = 'd5'
        ldrsh   r7, [lr, #(14*8)]       @ r7 = 'd7'

        @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
        orr     r9, r1, r3
        orr     r10, r5, r7
        orrs    r10, r9, r10
        beq     empty_odd_column

        stmdb   sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp11, tmp12

        @ Odd part
        add     r0, r3, r5              @ r0 = 'z2'
        add     r2, r1, r7              @ r2 = 'z1'
        add     r4, r3, r7              @ r4 = 'z3'
        add     r6, r1, r5              @ r6 = 'z4'
        ldr     r9, [r11, #FIX_1_175875602_ID]
        add     r8, r4, r6              @ r8 = z3 + z4
        ldr     r10, [r11, #FIX_M_0_899976223_ID]
        mul     r8, r9, r8              @ r8 = 'z5'
        ldr     r9, [r11, #FIX_M_2_562915447_ID]
        mul     r2, r10, r2             @ r2 = 'z1'
        ldr     r10, [r11, #FIX_M_1_961570560_ID]
        mul     r0, r9, r0              @ r0 = 'z2'
        ldr     r9, [r11, #FIX_M_0_390180644_ID]
        mla     r4, r10, r4, r8         @ r4 = 'z3'
        ldr     r10, [r11, #FIX_0_298631336_ID]
        mla     r6, r9, r6, r8          @ r6 = 'z4'
        ldr     r9, [r11, #FIX_2_053119869_ID]
        mla     r7, r10, r7, r2         @ r7 = tmp0 + z1
        ldr     r10, [r11, #FIX_3_072711026_ID]
        mla     r5, r9, r5, r0          @ r5 = tmp1 + z2
        ldr     r9, [r11, #FIX_1_501321110_ID]
        mla     r3, r10, r3, r0         @ r3 = tmp2 + z2
        add     r7, r7, r4              @ r7 = tmp0
        mla     r1, r9, r1, r2          @ r1 = tmp3 + z1
        add     r5, r5, r6              @ r5 = tmp1
        add     r3, r3, r4              @ r3 = tmp2
        add     r1, r1, r6              @ r1 = tmp3

        ldmia   sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
                                        @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0

        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
        add     r8, r0, r1
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 0*8)]

        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
        sub     r8, r0, r1
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #(14*8)]

        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
        add     r8, r4, r3
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 2*8)]

        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
        sub     r8, r4, r3
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #(12*8)]

        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
        add     r8, r6, r5
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 4*8)]

        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
        sub     r8, r6, r5
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #(10*8)]

        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
        add     r8, r2, r7
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 6*8)]

        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
        sub     r8, r2, r7
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #( 8*8)]

        @ End of column loop
        add     lr, lr, #2
        subs    r12, r12, #1
        bne     column_loop
        b       the_end                 @ all columns done; skip the shortcut path

empty_odd_column:
        @ d1, d3, d5 and d7 are all zero, so tmp0..tmp3 of the odd part are
        @ zero and each sum / difference pair below stores the same value.

        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
        add     r0, r0, #(1<<17)
        mov     r0, r0, asr #18
        strh    r0, [lr, #( 0*8)]
        strh    r0, [lr, #(14*8)]

        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
        add     r4, r4, #(1<<17)
        mov     r4, r4, asr #18
        strh    r4, [lr, #( 2*8)]
        strh    r4, [lr, #(12*8)]

        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
        add     r6, r6, #(1<<17)
        mov     r6, r6, asr #18
        strh    r6, [lr, #( 4*8)]
        strh    r6, [lr, #(10*8)]

        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
        add     r2, r2, #(1<<17)
        mov     r2, r2, asr #18
        strh    r2, [lr, #( 6*8)]
        strh    r2, [lr, #( 8*8)]

        @ End of column loop
        add     lr, lr, #2
        subs    r12, r12, #1
        bne     column_loop

the_end:
        @ The end....
        add     sp, sp, #4              @ drop the saved block pointer
        ldmia   sp!, { r4 - r12, pc }   @ restore callee saved regs and return
|
61
|
373
|
|
/*
 * Constant pool read by ff_j_rev_dct_arm through r11.  One 32-bit word per
 * entry; entry k is addressed as [r11, #4*k] via the matching FIX_*_ID
 * offset, so the order below must not change independently of those offsets.
 * The alignment directive sits before the label so that any padding it emits
 * can never end up between the label and the first word.
 */
        .align
const_array:
        .word   FIX_0_298631336
        .word   FIX_0_541196100
        .word   FIX_0_765366865
        .word   FIX_1_175875602
        .word   FIX_1_501321110
        .word   FIX_2_053119869
        .word   FIX_3_072711026
        .word   FIX_M_0_390180644
        .word   FIX_M_0_899976223
        .word   FIX_M_1_847759065
        .word   FIX_M_1_961570560
        .word   FIX_M_2_562915447
        .word   FIX_0xFFFF