comparison armv4l/simple_idct_arm.S @ 2979:bfabfdf9ce55 libavcodec

COSMETICS: tabs --> spaces, some prettyprinting
author diego
date Thu, 22 Dec 2005 01:10:11 +0000
parents ef2149182f1c
children 0b546eab515d
comparison
equal deleted inserted replaced
2978:403183bbb505 2979:bfabfdf9ce55
49 #define COL_SHIFT 20 49 #define COL_SHIFT 20
50 #define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */ 50 #define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
51 #define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */ 51 #define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
52 52
53 53
54 .text 54 .text
55 .align 55 .align
56 .global simple_idct_ARM 56 .global simple_idct_ARM
57 57
58 simple_idct_ARM: 58 simple_idct_ARM:
59 @@ void simple_idct_ARM(int16_t *block) 59 @@ void simple_idct_ARM(int16_t *block)
60 @@ save stack for reg needed (take all of them), 60 @@ save stack for reg needed (take all of them),
61 @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block 61 @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
118 ldr r10, [r12, #offW5] @ R10=W5 118 ldr r10, [r12, #offW5] @ R10=W5
119 mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 119 mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
120 ldr r11, [r12, #offW7] @ R11=W7 120 ldr r11, [r12, #offW7] @ R11=W7
121 mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 121 mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
122 mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 122 mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
123 teq r2, #0 @ if null avoid muls 123 teq r2, #0 @ if null avoid muls
124 mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 124 mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
125 rsbne r2, r2, #0 @ R2=-ROWr16[3] 125 rsbne r2, r2, #0 @ R2=-ROWr16[3]
126 mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 126 mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
127 mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 127 mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
128 mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 128 mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
129 129
145 @@ MAC16(b0, W7, row[7]); 145 @@ MAC16(b0, W7, row[7]);
146 @@ MAC16(b2, W3, row[7]); 146 @@ MAC16(b2, W3, row[7]);
147 @@ MAC16(b3, -W1, row[7]); 147 @@ MAC16(b3, -W1, row[7]);
148 @@ MAC16(b1, -W5, row[7]); 148 @@ MAC16(b1, -W5, row[7]);
149 mov r3, r3, asr #16 @ R3=ROWr16[5] 149 mov r3, r3, asr #16 @ R3=ROWr16[5]
150 teq r3, #0 @ if null avoid muls 150 teq r3, #0 @ if null avoid muls
151 mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0 151 mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
152 mov r4, r4, asr #16 @ R4=ROWr16[7] 152 mov r4, r4, asr #16 @ R4=ROWr16[7]
153 mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2 153 mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
154 mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3 154 mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
155 rsbne r3, r3, #0 @ R3=-ROWr16[5] 155 rsbne r3, r3, #0 @ R3=-ROWr16[5]
156 mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1 156 mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1
157 @@ R3 is free now 157 @@ R3 is free now
158 teq r4, #0 @ if null avoid muls 158 teq r4, #0 @ if null avoid muls
159 mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0 159 mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
160 mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2 160 mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
161 rsbne r4, r4, #0 @ R4=-ROWr16[7] 161 rsbne r4, r4, #0 @ R4=-ROWr16[7]
162 mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3 162 mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
163 mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1 163 mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
185 @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; 185 @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
186 @@ if (temp != 0) {} 186 @@ if (temp != 0) {}
187 teq r2, #0 187 teq r2, #0
188 beq __end_bef_a_evaluation 188 beq __end_bef_a_evaluation
189 189
190 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) 190 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
191 mul r11, r8, r4 @ R11=W2*ROWr16[2] 191 mul r11, r8, r4 @ R11=W2*ROWr16[2]
192 sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) 192 sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
193 add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) 193 add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
194 194
195 195
201 @@ a0 += W4*row[4] 201 @@ a0 += W4*row[4]
202 @@ a1 -= W4*row[4] 202 @@ a1 -= W4*row[4]
203 @@ a2 -= W4*row[4] 203 @@ a2 -= W4*row[4]
204 @@ a3 += W4*row[4] 204 @@ a3 += W4*row[4]
205 ldrsh r11, [r14, #8] @ R11=ROWr16[4] 205 ldrsh r11, [r14, #8] @ R11=ROWr16[4]
206 teq r11, #0 @ if null avoid muls 206 teq r11, #0 @ if null avoid muls
207 mulne r11, r9, r11 @ R11=W4*ROWr16[4] 207 mulne r11, r9, r11 @ R11=W4*ROWr16[4]
208 @@ R9 is free now 208 @@ R9 is free now
209 ldrsh r9, [r14, #12] @ R9=ROWr16[6] 209 ldrsh r9, [r14, #12] @ R9=ROWr16[6]
210 addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) 210 addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
211 subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) 211 subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
212 subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) 212 subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
213 addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) 213 addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
214 @@ W6 alone is no longer needed, save W2*ROWr16[6] in its register instead 214 @@ W6 alone is no longer needed, save W2*ROWr16[6] in its register instead
215 teq r9, #0 @ if null avoid muls 215 teq r9, #0 @ if null avoid muls
216 mulne r11, r10, r9 @ R11=W6*ROWr16[6] 216 mulne r11, r10, r9 @ R11=W6*ROWr16[6]
217 addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) 217 addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
218 mulne r10, r8, r9 @ R10=W2*ROWr16[6] 218 mulne r10, r8, r9 @ R10=W2*ROWr16[6]
219 @@ a0 += W6*row[6]; 219 @@ a0 += W6*row[6];
220 @@ a3 -= W6*row[6]; 220 @@ a3 -= W6*row[6];
292 sub r14, r14, #16 292 sub r14, r14, #16
293 bne __row_loop 293 bne __row_loop
294 294
295 295
296 296
297 @@ at this point, R0=block, R1-R11 (free) 297 @@ at this point, R0=block, R1-R11 (free)
298 @@ R12=__const_ptr_, R14=&block[n] 298 @@ R12=__const_ptr_, R14=&block[n]
299 add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block. 299 add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
300 __col_loop: 300 __col_loop:
301 301
302 __b_evaluation2: 302 __b_evaluation2:
303 @@ at this point, R0=block (temp), R1-R11 (free) 303 @@ at this point, R0=block (temp), R1-R11 (free)
304 @@ R12=__const_ptr_, R14=&block[n] 304 @@ R12=__const_ptr_, R14=&block[n]
305 @@ proceed with b0-b3 first, followed by a0-a3 305 @@ proceed with b0-b3 first, followed by a0-a3
306 @@ MUL16(b0, W1, col[8x1]); 306 @@ MUL16(b0, W1, col[8x1]);
307 @@ MUL16(b1, W3, col[8x1]); 307 @@ MUL16(b1, W3, col[8x1]);
308 @@ MUL16(b2, W5, col[8x1]); 308 @@ MUL16(b2, W5, col[8x1]);
309 @@ MUL16(b3, W7, col[8x1]); 309 @@ MUL16(b3, W7, col[8x1]);
310 @@ MAC16(b0, W3, col[8x3]); 310 @@ MAC16(b0, W3, col[8x3]);
311 @@ MAC16(b1, -W7, col[8x3]); 311 @@ MAC16(b1, -W7, col[8x3]);
312 @@ MAC16(b2, -W1, col[8x3]); 312 @@ MAC16(b2, -W1, col[8x3]);
313 @@ MAC16(b3, -W5, col[8x3]); 313 @@ MAC16(b3, -W5, col[8x3]);
314 ldr r8, [r12, #offW1] @ R8=W1 314 ldr r8, [r12, #offW1] @ R8=W1
315 ldrsh r7, [r14, #16] 315 ldrsh r7, [r14, #16]
316 mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 316 mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
317 ldr r9, [r12, #offW3] @ R9=W3 317 ldr r9, [r12, #offW3] @ R9=W3
318 ldr r10, [r12, #offW5] @ R10=W5 318 ldr r10, [r12, #offW5] @ R10=W5
319 mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 319 mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
320 ldr r11, [r12, #offW7] @ R11=W7 320 ldr r11, [r12, #offW7] @ R11=W7
321 mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 321 mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
322 ldrsh r2, [r14, #48] 322 ldrsh r2, [r14, #48]
323 mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 323 mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
324 teq r2, #0 @ if 0, then avoid muls 324 teq r2, #0 @ if 0, then avoid muls
325 mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 325 mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
326 rsbne r2, r2, #0 @ R2=-ROWr16[3] 326 rsbne r2, r2, #0 @ R2=-ROWr16[3]
327 mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 327 mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
328 mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 328 mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
329 mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 329 mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
330 330
331 @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), 331 @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
332 @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, 332 @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
333 @@ R12=__const_ptr_, R14=&block[n] 333 @@ R12=__const_ptr_, R14=&block[n]
334 @@ MAC16(b0, W5, col[5x8]); 334 @@ MAC16(b0, W5, col[5x8]);
335 @@ MAC16(b2, W7, col[5x8]); 335 @@ MAC16(b2, W7, col[5x8]);
336 @@ MAC16(b3, W3, col[5x8]); 336 @@ MAC16(b3, W3, col[5x8]);
337 @@ MAC16(b1, -W1, col[5x8]); 337 @@ MAC16(b1, -W1, col[5x8]);
338 @@ MAC16(b0, W7, col[7x8]); 338 @@ MAC16(b0, W7, col[7x8]);
339 @@ MAC16(b2, W3, col[7x8]); 339 @@ MAC16(b2, W3, col[7x8]);
340 @@ MAC16(b3, -W1, col[7x8]); 340 @@ MAC16(b3, -W1, col[7x8]);
341 @@ MAC16(b1, -W5, col[7x8]); 341 @@ MAC16(b1, -W5, col[7x8]);
342 ldrsh r3, [r14, #80] @ R3=COLr16[5x8] 342 ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
343 teq r3, #0 @ if 0 then avoid muls 343 teq r3, #0 @ if 0 then avoid muls
344 mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0 344 mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
345 mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2 345 mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
346 mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3 346 mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
347 rsbne r3, r3, #0 @ R3=-ROWr16[5x8] 347 rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
348 ldrsh r4, [r14, #112] @ R4=COLr16[7x8] 348 ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
349 mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1 349 mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1
350 @@ R3 is free now 350 @@ R3 is free now
351 teq r4, #0 @ if 0 then avoid muls 351 teq r4, #0 @ if 0 then avoid muls
352 mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0 352 mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
353 mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2 353 mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
354 rsbne r4, r4, #0 @ R4=-ROWr16[7x8] 354 rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
355 mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3 355 mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
356 mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1 356 mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
357 @@ R4 is free now 357 @@ R4 is free now
358 __end_b_evaluation2: 358 __end_b_evaluation2:
359 @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), 359 @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
360 @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), 360 @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
361 @@ R12=__const_ptr_, R14=&block[n] 361 @@ R12=__const_ptr_, R14=&block[n]
362 362
363 __a_evaluation2: 363 __a_evaluation2:
364 @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1)); 364 @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
365 @@ a1 = a0 + W6 * col[8x2]; 365 @@ a1 = a0 + W6 * col[8x2];
366 @@ a2 = a0 - W6 * col[8x2]; 366 @@ a2 = a0 - W6 * col[8x2];
367 @@ a3 = a0 - W2 * col[8x2]; 367 @@ a3 = a0 - W2 * col[8x2];
368 @@ a0 = a0 + W2 * col[8x2]; 368 @@ a0 = a0 + W2 * col[8x2];
369 ldrsh r6, [r14, #0] 369 ldrsh r6, [r14, #0]
370 ldr r9, [r12, #offW4] @ R9=W4 370 ldr r9, [r12, #offW4] @ R9=W4
371 mul r6, r9, r6 @ R6=W4*ROWr16[0] 371 mul r6, r9, r6 @ R6=W4*ROWr16[0]
372 ldr r10, [r12, #offW6] @ R10=W6 372 ldr r10, [r12, #offW6] @ R10=W6
373 ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet) 373 ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet)
374 add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0) 374 add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
375 mul r11, r10, r4 @ R11=W6*ROWr16[2] 375 mul r11, r10, r4 @ R11=W6*ROWr16[2]
376 ldr r8, [r12, #offW2] @ R8=W2 376 ldr r8, [r12, #offW2] @ R8=W2
377 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) 377 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
378 sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) 378 sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
379 mul r11, r8, r4 @ R11=W2*ROWr16[2] 379 mul r11, r8, r4 @ R11=W2*ROWr16[2]
380 sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) 380 sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
381 add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) 381 add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
382 382
383 @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, 383 @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
384 @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), 384 @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
385 @@ R12=__const_ptr_, R14=&block[n] 385 @@ R12=__const_ptr_, R14=&block[n]
386 @@ a0 += W4*col[8x4] 386 @@ a0 += W4*col[8x4]
387 @@ a1 -= W4*col[8x4] 387 @@ a1 -= W4*col[8x4]
388 @@ a2 -= W4*col[8x4] 388 @@ a2 -= W4*col[8x4]
389 @@ a3 += W4*col[8x4] 389 @@ a3 += W4*col[8x4]
390 ldrsh r11, [r14, #64] @ R11=ROWr16[4] 390 ldrsh r11, [r14, #64] @ R11=ROWr16[4]
391 teq r11, #0 @ if null avoid muls 391 teq r11, #0 @ if null avoid muls
392 mulne r11, r9, r11 @ R11=W4*ROWr16[4] 392 mulne r11, r9, r11 @ R11=W4*ROWr16[4]
393 @@ R9 is free now 393 @@ R9 is free now
394 addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) 394 addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
395 subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) 395 subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
396 subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) 396 subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
397 ldrsh r9, [r14, #96] @ R9=ROWr16[6] 397 ldrsh r9, [r14, #96] @ R9=ROWr16[6]
398 addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) 398 addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
399 @@ W6 alone is no longer needed, save W2*ROWr16[6] in its register instead 399 @@ W6 alone is no longer needed, save W2*ROWr16[6] in its register instead
400 teq r9, #0 @ if null avoid muls 400 teq r9, #0 @ if null avoid muls
401 mulne r11, r10, r9 @ R11=W6*ROWr16[6] 401 mulne r11, r10, r9 @ R11=W6*ROWr16[6]
402 addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) 402 addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
403 mulne r10, r8, r9 @ R10=W2*ROWr16[6] 403 mulne r10, r8, r9 @ R10=W2*ROWr16[6]
404 @@ a0 += W6*col[8x6]; 404 @@ a0 += W6*col[8x6];
405 @@ a3 -= W6*col[8x6]; 405 @@ a3 -= W6*col[8x6];
406 @@ a1 -= W2*col[8x6]; 406 @@ a1 -= W2*col[8x6];
407 @@ a2 += W2*col[8x6]; 407 @@ a2 += W2*col[8x6];
408 subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) 408 subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
409 subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) 409 subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
410 addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) 410 addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
411 __end_a_evaluation2: 411 __end_a_evaluation2:
412 @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, 412 @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
413 @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), 413 @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
414 @@ R12=__const_ptr_, R14=&block[n] 414 @@ R12=__const_ptr_, R14=&block[n]
415 @@ col[0 ] = ((a0 + b0) >> COL_SHIFT); 415 @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
416 @@ col[8 ] = ((a1 + b1) >> COL_SHIFT); 416 @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
417 @@ col[16] = ((a2 + b2) >> COL_SHIFT); 417 @@ col[16] = ((a2 + b2) >> COL_SHIFT);
418 @@ col[24] = ((a3 + b3) >> COL_SHIFT); 418 @@ col[24] = ((a3 + b3) >> COL_SHIFT);
419 @@ col[32] = ((a3 - b3) >> COL_SHIFT); 419 @@ col[32] = ((a3 - b3) >> COL_SHIFT);
420 @@ col[40] = ((a2 - b2) >> COL_SHIFT); 420 @@ col[40] = ((a2 - b2) >> COL_SHIFT);
421 @@ col[48] = ((a1 - b1) >> COL_SHIFT); 421 @@ col[48] = ((a1 - b1) >> COL_SHIFT);
422 @@ col[56] = ((a0 - b0) >> COL_SHIFT); 422 @@ col[56] = ((a0 - b0) >> COL_SHIFT);
423 @@@@@ no optimisation here @@@@@ 423 @@@@@ no optimisation here @@@@@
424 add r8, r6, r0 @ R8=a0+b0 424 add r8, r6, r0 @ R8=a0+b0
425 add r9, r2, r1 @ R9=a1+b1 425 add r9, r2, r1 @ R9=a1+b1
426 mov r8, r8, asr #COL_SHIFT 426 mov r8, r8, asr #COL_SHIFT
427 mov r9, r9, asr #COL_SHIFT 427 mov r9, r9, asr #COL_SHIFT
428 strh r8, [r14, #0] 428 strh r8, [r14, #0]
429 strh r9, [r14, #16] 429 strh r9, [r14, #16]
430 add r8, r3, r5 @ R8=a2+b2 430 add r8, r3, r5 @ R8=a2+b2
431 add r9, r4, r7 @ R9=a3+b3 431 add r9, r4, r7 @ R9=a3+b3
432 mov r8, r8, asr #COL_SHIFT 432 mov r8, r8, asr #COL_SHIFT
433 mov r9, r9, asr #COL_SHIFT 433 mov r9, r9, asr #COL_SHIFT
434 strh r8, [r14, #32] 434 strh r8, [r14, #32]
435 strh r9, [r14, #48] 435 strh r9, [r14, #48]
436 sub r8, r4, r7 @ R8=a3-b3 436 sub r8, r4, r7 @ R8=a3-b3
437 sub r9, r3, r5 @ R9=a2-b2 437 sub r9, r3, r5 @ R9=a2-b2
438 mov r8, r8, asr #COL_SHIFT 438 mov r8, r8, asr #COL_SHIFT
439 mov r9, r9, asr #COL_SHIFT 439 mov r9, r9, asr #COL_SHIFT
440 strh r8, [r14, #64] 440 strh r8, [r14, #64]
441 strh r9, [r14, #80] 441 strh r9, [r14, #80]
442 sub r8, r2, r1 @ R8=a1-b1 442 sub r8, r2, r1 @ R8=a1-b1
443 sub r9, r6, r0 @ R9=a0-b0 443 sub r9, r6, r0 @ R9=a0-b0
444 mov r8, r8, asr #COL_SHIFT 444 mov r8, r8, asr #COL_SHIFT
445 mov r9, r9, asr #COL_SHIFT 445 mov r9, r9, asr #COL_SHIFT
446 strh r8, [r14, #96] 446 strh r8, [r14, #96]
447 strh r9, [r14, #112] 447 strh r9, [r14, #112]
448 448
449 __end_col_loop: 449 __end_col_loop:
450 @@ at this point, R0-R11 (free) 450 @@ at this point, R0-R11 (free)
451 @@ R12=__const_ptr_, R14=&block[n] 451 @@ R12=__const_ptr_, R14=&block[n]
452 ldr r0, [sp, #0] @ R0=block 452 ldr r0, [sp, #0] @ R0=block
453 teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished. 453 teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished.
454 sub r14, r14, #2 454 sub r14, r14, #2
455 bne __col_loop 455 bne __col_loop
456 456
457 457
458 458
459 459
460 __end_simple_idct_ARM: 460 __end_simple_idct_ARM:
464 464
465 465
466 466
467 @@ kind of sub-function, here not to overload the common case. 467 @@ kind of sub-function, here not to overload the common case.
468 __end_bef_a_evaluation: 468 __end_bef_a_evaluation:
469 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) 469 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
470 mul r11, r8, r4 @ R11=W2*ROWr16[2] 470 mul r11, r8, r4 @ R11=W2*ROWr16[2]
471 sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) 471 sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
472 add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) 472 add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
473 bal __end_a_evaluation 473 bal __end_a_evaluation
474 474
475 475
476 __constant_ptr__: @@ see #defines at the beginning of the source code for values. 476 __constant_ptr__: @@ see #defines at the beginning of the source code for values.
477 .align 477 .align
478 .word W1 478 .word W1
479 .word W2 479 .word W2
480 .word W3 480 .word W3
481 .word W4 481 .word W4
482 .word W5 482 .word W5