/*
 * simple_idct_arm.S
 * Copyright (C) 2002 Frederic 'dilb' Boulay.
 *
 * Author: Frederic Boulay <dilb@handhelds.org>
 *
 * The function defined in this file is derived from the simple_idct function
 * from the libavcodec library part of the FFmpeg project.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

/* Useful constants for the algorithm; they are saved in __constant_ptr__ */
/* at the end of the source code. */
#define W1 22725
#define W2 21407
#define W3 19266
#define W4 16383
#define W5 12873
#define W6 8867
#define W7 4520
#define MASK_MSHW 0xFFFF0000
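
/* The Wi factors appear to follow the usual simple_idct convention (an
   assumption here, not stated in this file):
       Wi = round(sqrt(2) * cos(i*PI/16) * (1<<14)),
   e.g. W1 = round(1.414214 * 0.980785 * 16384) = 22725. W4 would round to
   16384 exactly; 16383 presumably keeps the 32-bit accumulators clear of
   overflow. */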

/* offsets of the constants in the vector */
#define offW1 0
#define offW2 4
#define offW3 8
#define offW4 12
#define offW5 16
#define offW6 20
#define offW7 24
#define offMASK_MSHW 28

#define ROW_SHIFT 11
#define ROW_SHIFT2MSHW (16-11)
#define COL_SHIFT 20
#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
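
/* Adding 1 << (SHIFT-1) before the final arithmetic shift right by SHIFT
   rounds to nearest instead of truncating; in the code below the bias is
   folded into a0 once, so e.g. the row pass effectively computes
   row[0] = (a0 + b0) >> ROW_SHIFT with ROW_SHIFTED_1 already inside a0. */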


        .text

function simple_idct_ARM, export=1
        @@ void simple_idct_ARM(int16_t *block)
        @@ make room on the stack for the registers that must be saved (take
        @@ all of them): R0-R3 are scratch registers, so they need not be
        @@ saved, but R0 contains the pointer to block and must not be
        @@ overwritten, hence it is saved on the stack below. R12 is another
        @@ scratch register, so it need not be saved either.
        @@ save all callee-saved registers
        stmfd sp!, {r4-r11, r14}    @ R14 is also called LR
        @@ at this point, R0=block, the other registers are free.
        add r14, r0, #112           @ R14=&block[8*7]; it is better to start from the last row and decrease the pointer until row 0 is reached, i.e. R14=block.
        add r12, pc, #(__constant_ptr__-.-8) @ R12=__constant_ptr__, the vector containing the constants; probably not necessary to reserve a register for it
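        @@ (in ARM state, pc reads as the address of the current instruction
        @@ plus 8, hence the "-.-8" correction when forming the pool address)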
        @@ reserve 2 temporary variables on the stack, for R0 and R14
        sub sp, sp, #8              @ allow 2 local variables
        str r0, [sp, #0]            @ save block in sp[0]
        @@ stack status
        @@ sp+4  free
        @@ sp+0  R0 (block)


        @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free


__row_loop:
        @@ read the row and check whether it is null, almost null, or neither;
        @@ according to the StrongARM specs it is not necessary to optimize the
        @@ ldr accesses (i.e. to split each 32-bit load into two 16-bit ones),
        @@ and at least this way more usable registers are left free :)
        ldr r1, [r14, #0]           @ R1=(int32)(R14)[0]=ROWr32[0] (the row cast to a 32-bit pointer)
        ldr r2, [r14, #4]           @ R2=(int32)(R14)[1]=ROWr32[1]
        ldr r3, [r14, #8]           @ R3=ROWr32[2]
        ldr r4, [r14, #12]          @ R4=ROWr32[3]
        @@ check whether the words are null: if all of them are, proceed with
        @@ the next row (branch __end_row_loop); if ROWr16[0] is the only one
        @@ that is not null, handle that special case (branch
        @@ __almost_empty_row); otherwise follow the complete algorithm.
        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
        @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
        orr r5, r4, r3              @ R5=R4 | R3
        orr r5, r5, r2              @ R5=R4 | R3 | R2
        orrs r6, r5, r1             @ test R5 | R1 (the aim is to check whether everything is null)
        beq __end_row_loop
        mov r7, r1, asr #16         @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
        ldrsh r6, [r14, #0]         @ R6=ROWr16[0]
        orrs r5, r5, r7             @ R5=R4 | R3 | R2 | R7
        beq __almost_empty_row

__b_evaluation:
        @@ at this point, R0=block (temp), R1 (free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
        @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3

        @@ MUL16(b0, W1, row[1]);
        @@ MUL16(b1, W3, row[1]);
        @@ MUL16(b2, W5, row[1]);
        @@ MUL16(b3, W7, row[1]);
        @@ MAC16(b0, W3, row[3]);
        @@ MAC16(b1, -W7, row[3]);
        @@ MAC16(b2, -W1, row[3]);
        @@ MAC16(b3, -W5, row[3]);
        ldr r8, [r12, #offW1]       @ R8=W1
        mov r2, r2, asr #16         @ R2=ROWr16[3]
        mul r0, r8, r7              @ R0=W1*ROWr16[1]=b0 (ROWr16[1] as the second operand may save a cycle; likewise for the muls and mlas below)
        ldr r9, [r12, #offW3]       @ R9=W3
        ldr r10, [r12, #offW5]      @ R10=W5
        mul r1, r9, r7              @ R1=W3*ROWr16[1]=b1
        ldr r11, [r12, #offW7]      @ R11=W7
        mul r5, r10, r7             @ R5=W5*ROWr16[1]=b2
        mul r7, r11, r7             @ R7=W7*ROWr16[1]=b3
        teq r2, #0                  @ if null, avoid the muls
        mlane r0, r9, r2, r0        @ R0+=W3*ROWr16[3]=b0
        rsbne r2, r2, #0            @ R2=-ROWr16[3]
        mlane r1, r11, r2, r1       @ R1-=W7*ROWr16[3]=b1
        mlane r5, r8, r2, r5        @ R5-=W1*ROWr16[3]=b2
        mlane r7, r10, r2, r7       @ R7-=W5*ROWr16[3]=b3
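        @@ note: each MAC16(b, -W, x) above is realized by negating x once
        @@ (rsbne) and then using a plain mlane, since b + W*(-x) == b - W*x;
        @@ the NE conditions make the whole group a no-op when x is zero.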

        @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
        @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        orrs r2, r3, r4             @ R2=ROWr32[2] | ROWr32[3]
        beq __end_b_evaluation

        @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
        @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ MAC16(b0, W5, row[5]);
        @@ MAC16(b2, W7, row[5]);
        @@ MAC16(b3, W3, row[5]);
        @@ MAC16(b1, -W1, row[5]);
        @@ MAC16(b0, W7, row[7]);
        @@ MAC16(b2, W3, row[7]);
        @@ MAC16(b3, -W1, row[7]);
        @@ MAC16(b1, -W5, row[7]);
        mov r3, r3, asr #16         @ R3=ROWr16[5]
        teq r3, #0                  @ if null, avoid the muls
        mlane r0, r10, r3, r0       @ R0+=W5*ROWr16[5]=b0
        mov r4, r4, asr #16         @ R4=ROWr16[7]
        mlane r5, r11, r3, r5       @ R5+=W7*ROWr16[5]=b2
        mlane r7, r9, r3, r7        @ R7+=W3*ROWr16[5]=b3
        rsbne r3, r3, #0            @ R3=-ROWr16[5]
        mlane r1, r8, r3, r1        @ R1-=W1*ROWr16[5]=b1
        @@ R3 is free now
        teq r4, #0                  @ if null, avoid the muls
        mlane r0, r11, r4, r0       @ R0+=W7*ROWr16[7]=b0
        mlane r5, r9, r4, r5        @ R5+=W3*ROWr16[7]=b2
        rsbne r4, r4, #0            @ R4=-ROWr16[7]
        mlane r7, r8, r4, r7        @ R7-=W1*ROWr16[7]=b3
        mlane r1, r10, r4, r1       @ R1-=W5*ROWr16[7]=b1
        @@ R4 is free now
__end_b_evaluation:
        @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
        @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]

__a_evaluation:
        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
        @@ a1 = a0 + W6 * row[2];
        @@ a2 = a0 - W6 * row[2];
        @@ a3 = a0 - W2 * row[2];
        @@ a0 = a0 + W2 * row[2];
        ldr r9, [r12, #offW4]       @ R9=W4
        mul r6, r9, r6              @ R6=W4*ROWr16[0]
        ldr r10, [r12, #offW6]      @ R10=W6
        ldrsh r4, [r14, #4]         @ R4=ROWr16[2] (a3 not defined yet)
        add r6, r6, #ROW_SHIFTED_1  @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)

        mul r11, r10, r4            @ R11=W6*ROWr16[2]
        ldr r8, [r12, #offW2]       @ R8=W2
        sub r3, r6, r11             @ R3=a0-W6*ROWr16[2] (a2)
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        teq r2, #0
        beq __end_bef_a_evaluation
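        @@ (when the upper half of the row is zero, the remaining a1/a3/a0
        @@ computation is finished out of line at __end_bef_a_evaluation,
        @@ which then branches back to __end_a_evaluation)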

        add r2, r6, r11             @ R2=a0+W6*ROWr16[2] (a1)
        mul r11, r8, r4             @ R11=W2*ROWr16[2]
        sub r4, r6, r11             @ R4=a0-W2*ROWr16[2] (a3)
        add r6, r6, r11             @ R6=a0+W2*ROWr16[2] (a0)


        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]


        @@ a0 += W4*row[4]
        @@ a1 -= W4*row[4]
        @@ a2 -= W4*row[4]
        @@ a3 += W4*row[4]
        ldrsh r11, [r14, #8]        @ R11=ROWr16[4]
        teq r11, #0                 @ if null, avoid the muls
        mulne r11, r9, r11          @ R11=W4*ROWr16[4]
        @@ R9 is free now
        ldrsh r9, [r14, #12]        @ R9=ROWr16[6]
        addne r6, r6, r11           @ R6+=W4*ROWr16[4] (a0)
        subne r2, r2, r11           @ R2-=W4*ROWr16[4] (a1)
        subne r3, r3, r11           @ R3-=W4*ROWr16[4] (a2)
        addne r4, r4, r11           @ R4+=W4*ROWr16[4] (a3)
        @@ W6 is no longer needed, so R10 is reused for W2*ROWr16[6] instead
        teq r9, #0                  @ if null, avoid the muls
        mulne r11, r10, r9          @ R11=W6*ROWr16[6]
        addne r6, r6, r11           @ R6+=W6*ROWr16[6] (a0)
        mulne r10, r8, r9           @ R10=W2*ROWr16[6]
        @@ a0 += W6*row[6];
        @@ a3 -= W6*row[6];
        @@ a1 -= W2*row[6];
        @@ a2 += W2*row[6];
        subne r4, r4, r11           @ R4-=W6*ROWr16[6] (a3)
        subne r2, r2, r10           @ R2-=W2*ROWr16[6] (a1)
        addne r3, r3, r10           @ R3+=W2*ROWr16[6] (a2)

__end_a_evaluation:
        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
        add r8, r6, r0              @ R8=a0+b0
        add r9, r2, r1              @ R9=a1+b1
        @@ pack two 16-bit halfwords into one 32-bit word
        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (this packing is little-endian only!)
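        @@ a C sketch of the packing below (names are descriptive only):
        @@     packed = (0x0000FFFF & ((a0 + b0) >> ROW_SHIFT))
        @@            | (0xFFFF0000 & ((a1 + b1) << ROW_SHIFT2MSHW));
        @@ shifting left by 16-11=5 and masking the upper halfword is the
        @@ same as ">> ROW_SHIFT, then << 16", saving one shift per pair.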
        ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
        mvn r11, r10                @ R11=NOT R10=0x0000FFFF
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
        orr r8, r8, r9
        str r8, [r14, #0]

        add r8, r3, r5              @ R8=a2+b2
        add r9, r4, r7              @ R9=a3+b3
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
        orr r8, r8, r9
        str r8, [r14, #4]

        sub r8, r4, r7              @ R8=a3-b3
        sub r9, r3, r5              @ R9=a2-b2
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
        orr r8, r8, r9
        str r8, [r14, #8]

        sub r8, r2, r1              @ R8=a1-b1
        sub r9, r6, r0              @ R9=a0-b0
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
        orr r8, r8, r9
        str r8, [r14, #12]

        bal __end_row_loop

__almost_empty_row:
        @@ the row was empty except for ROWr16[0]; handle this special case
        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
        @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
        @@ R8=0xFFFF (temp), R9-R11 free
        mov r8, #0x10000            @ building R8=0xFFFF takes 2 steps, but it still saves a ldr (and its load delay)
        sub r8, r8, #1              @ R8 is now ready (0xFFFF)
        and r5, r8, r6, lsl #3      @ R5=R8 & (R6<<3)=(ROWr16[0]<<3) & 0xFFFF
        orr r5, r5, r5, lsl #16     @ R5=R5 | (R5<<16)
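        @@ why "<<3": for a DC-only row every output is
        @@ (W4*ROWr16[0] + ROW_SHIFTED_1) >> ROW_SHIFT, and since W4 is close
        @@ to 1<<14 while ROW_SHIFT is 11, this reduces to
        @@ ROWr16[0] << (14-11) = ROWr16[0] << 3 (exact up to the W4=16383
        @@ rounding).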
        str r5, [r14, #0]           @ R14[0]=ROWr32[0]=R5
        str r5, [r14, #4]           @ R14[4]=ROWr32[1]=R5
        str r5, [r14, #8]           @ R14[8]=ROWr32[2]=R5
        str r5, [r14, #12]          @ R14[12]=ROWr32[3]=R5

__end_row_loop:
        @@ at this point, R0-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        ldr r0, [sp, #0]            @ R0=block
        teq r0, r14                 @ compare the current &block[8*n] to block; once block is reached, the loop is finished
        sub r14, r14, #16
        bne __row_loop
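        @@ (sub does not affect the flags, so the decrement can sit between
        @@ teq and bne without disturbing the comparison result)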



        @@ at this point, R0=block, R1-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        add r14, r0, #14            @ R14=&block[7]; it is better to start from the last column and decrease the pointer until column 0 is reached, i.e. R14=block.
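        @@ addressing note: block is an 8x8 int16_t matrix in row-major order,
        @@ so from a column base pointer, element col[8*i] lives at byte
        @@ offset 16*i (hence the #16/#32/.../#112 offsets below), while
        @@ neighbouring columns are 2 bytes apart (hence the
        @@ "sub r14, r14, #2" per iteration).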
__col_loop:

__b_evaluation2:
        @@ at this point, R0=block (temp), R1-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        @@ proceed with b0-b3 first, followed by a0-a3
        @@ MUL16(b0, W1, col[8*1]);
        @@ MUL16(b1, W3, col[8*1]);
        @@ MUL16(b2, W5, col[8*1]);
        @@ MUL16(b3, W7, col[8*1]);
        @@ MAC16(b0, W3, col[8*3]);
        @@ MAC16(b1, -W7, col[8*3]);
        @@ MAC16(b2, -W1, col[8*3]);
        @@ MAC16(b3, -W5, col[8*3]);
        ldr r8, [r12, #offW1]       @ R8=W1
        ldrsh r7, [r14, #16]        @ R7=COLr16[8*1]
        mul r0, r8, r7              @ R0=W1*COLr16[8*1]=b0 (COLr16[8*1] as the second operand may save a cycle; likewise below)
        ldr r9, [r12, #offW3]       @ R9=W3
        ldr r10, [r12, #offW5]      @ R10=W5
        mul r1, r9, r7              @ R1=W3*COLr16[8*1]=b1
        ldr r11, [r12, #offW7]      @ R11=W7
        mul r5, r10, r7             @ R5=W5*COLr16[8*1]=b2
        ldrsh r2, [r14, #48]        @ R2=COLr16[8*3]
        mul r7, r11, r7             @ R7=W7*COLr16[8*1]=b3
        teq r2, #0                  @ if null, avoid the muls
        mlane r0, r9, r2, r0        @ R0+=W3*COLr16[8*3]=b0
        rsbne r2, r2, #0            @ R2=-COLr16[8*3]
        mlane r1, r11, r2, r1       @ R1-=W7*COLr16[8*3]=b1
        mlane r5, r8, r2, r5        @ R5-=W1*COLr16[8*3]=b2
        mlane r7, r10, r2, r7       @ R7-=W5*COLr16[8*3]=b3

        @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
        @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ MAC16(b0, W5, col[8*5]);
        @@ MAC16(b2, W7, col[8*5]);
        @@ MAC16(b3, W3, col[8*5]);
        @@ MAC16(b1, -W1, col[8*5]);
        @@ MAC16(b0, W7, col[8*7]);
        @@ MAC16(b2, W3, col[8*7]);
        @@ MAC16(b3, -W1, col[8*7]);
        @@ MAC16(b1, -W5, col[8*7]);
        ldrsh r3, [r14, #80]        @ R3=COLr16[8*5]
        teq r3, #0                  @ if null, avoid the muls
        mlane r0, r10, r3, r0       @ R0+=W5*COLr16[8*5]=b0
        mlane r5, r11, r3, r5       @ R5+=W7*COLr16[8*5]=b2
        mlane r7, r9, r3, r7        @ R7+=W3*COLr16[8*5]=b3
        rsbne r3, r3, #0            @ R3=-COLr16[8*5]
        ldrsh r4, [r14, #112]       @ R4=COLr16[8*7]
        mlane r1, r8, r3, r1        @ R1-=W1*COLr16[8*5]=b1
        @@ R3 is free now
        teq r4, #0                  @ if null, avoid the muls
        mlane r0, r11, r4, r0       @ R0+=W7*COLr16[8*7]=b0
        mlane r5, r9, r4, r5        @ R5+=W3*COLr16[8*7]=b2
        rsbne r4, r4, #0            @ R4=-COLr16[8*7]
        mlane r7, r8, r4, r7        @ R7-=W1*COLr16[8*7]=b3
        mlane r1, r10, r4, r1       @ R1-=W5*COLr16[8*7]=b1
        @@ R4 is free now
__end_b_evaluation2:
        @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
        @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]

__a_evaluation2:
        @@ a0 = (W4 * col[8*0]) + (1 << (COL_SHIFT - 1));
        @@ a1 = a0 + W6 * col[8*2];
        @@ a2 = a0 - W6 * col[8*2];
        @@ a3 = a0 - W2 * col[8*2];
        @@ a0 = a0 + W2 * col[8*2];
        ldrsh r6, [r14, #0]         @ R6=COLr16[8*0]
        ldr r9, [r12, #offW4]       @ R9=W4
        mul r6, r9, r6              @ R6=W4*COLr16[8*0]
        ldr r10, [r12, #offW6]      @ R10=W6
        ldrsh r4, [r14, #32]        @ R4=COLr16[8*2] (a3 not defined yet)
        add r6, r6, #COL_SHIFTED_1  @ R6=W4*COLr16[8*0] + 1<<(COL_SHIFT-1) (a0)
        mul r11, r10, r4            @ R11=W6*COLr16[8*2]
        ldr r8, [r12, #offW2]       @ R8=W2
        add r2, r6, r11             @ R2=a0+W6*COLr16[8*2] (a1)
        sub r3, r6, r11             @ R3=a0-W6*COLr16[8*2] (a2)
        mul r11, r8, r4             @ R11=W2*COLr16[8*2]
        sub r4, r6, r11             @ R4=a0-W2*COLr16[8*2] (a3)
        add r6, r6, r11             @ R6=a0+W2*COLr16[8*2] (a0)

        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
        @@ a0 += W4*col[8*4]
        @@ a1 -= W4*col[8*4]
        @@ a2 -= W4*col[8*4]
        @@ a3 += W4*col[8*4]
        ldrsh r11, [r14, #64]       @ R11=COLr16[8*4]
        teq r11, #0                 @ if null, avoid the muls
        mulne r11, r9, r11          @ R11=W4*COLr16[8*4]
        @@ R9 is free now
        addne r6, r6, r11           @ R6+=W4*COLr16[8*4] (a0)
        subne r2, r2, r11           @ R2-=W4*COLr16[8*4] (a1)
        subne r3, r3, r11           @ R3-=W4*COLr16[8*4] (a2)
        ldrsh r9, [r14, #96]        @ R9=COLr16[8*6]
        addne r4, r4, r11           @ R4+=W4*COLr16[8*4] (a3)
        @@ W6 is no longer needed, so R10 is reused for W2*COLr16[8*6] instead
        teq r9, #0                  @ if null, avoid the muls
        mulne r11, r10, r9          @ R11=W6*COLr16[8*6]
        addne r6, r6, r11           @ R6+=W6*COLr16[8*6] (a0)
        mulne r10, r8, r9           @ R10=W2*COLr16[8*6]
        @@ a0 += W6*col[8*6];
        @@ a3 -= W6*col[8*6];
        @@ a1 -= W2*col[8*6];
        @@ a2 += W2*col[8*6];
        subne r4, r4, r11           @ R4-=W6*COLr16[8*6] (a3)
        subne r2, r2, r10           @ R2-=W2*COLr16[8*6] (a1)
        addne r3, r3, r10           @ R3+=W2*COLr16[8*6] (a2)
__end_a_evaluation2:
        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
        @@@@@ no optimization here @@@@@
        add r8, r6, r0              @ R8=a0+b0
        add r9, r2, r1              @ R9=a1+b1
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #0]
        strh r9, [r14, #16]
        add r8, r3, r5              @ R8=a2+b2
        add r9, r4, r7              @ R9=a3+b3
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #32]
        strh r9, [r14, #48]
        sub r8, r4, r7              @ R8=a3-b3
        sub r9, r3, r5              @ R9=a2-b2
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #64]
        strh r9, [r14, #80]
        sub r8, r2, r1              @ R8=a1-b1
        sub r9, r6, r0              @ R9=a0-b0
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #96]
        strh r9, [r14, #112]

__end_col_loop:
        @@ at this point, R0-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        ldr r0, [sp, #0]            @ R0=block
        teq r0, r14                 @ compare the current &block[n] to block; once block is reached, the loop is finished
        sub r14, r14, #2
        bne __col_loop




__end_simple_idct_ARM:
        @@ restore the registers to their previous state
        add sp, sp, #8              @@ drop the 2 local variables
        ldmfd sp!, {r4-r11, r15}    @@ restore the saved registers and load the saved LR into PC, i.e. return



@@ a kind of sub-function, placed out of line so as not to burden the common case
__end_bef_a_evaluation:
        add r2, r6, r11             @ R2=a0+W6*ROWr16[2] (a1)
        mul r11, r8, r4             @ R11=W2*ROWr16[2]
        sub r4, r6, r11             @ R4=a0-W2*ROWr16[2] (a3)
        add r6, r6, r11             @ R6=a0+W2*ROWr16[2] (a0)
        bal __end_a_evaluation


__constant_ptr__:                   @@ see the #defines at the beginning of the source code for the values
        .align
        .word W1
        .word W2
        .word W3
        .word W4
        .word W5
        .word W6
        .word W7
        .word MASK_MSHW