Mercurial > libavcodec.hg
comparison armv4l/simple_idct_arm.S @ 2979:bfabfdf9ce55 libavcodec
COSMETICS: tabs --> spaces, some prettyprinting
author | diego |
---|---|
date | Thu, 22 Dec 2005 01:10:11 +0000 |
parents | ef2149182f1c |
children | 0b546eab515d |
comparison
equal
deleted
inserted
replaced
2978:403183bbb505 | 2979:bfabfdf9ce55 |
---|---|
49 #define COL_SHIFT 20 | 49 #define COL_SHIFT 20 |
50 #define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */ | 50 #define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */ |
51 #define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */ | 51 #define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */ |
52 | 52 |
53 | 53 |
54 .text | 54 .text |
55 .align | 55 .align |
56 .global simple_idct_ARM | 56 .global simple_idct_ARM |
57 | 57 |
58 simple_idct_ARM: | 58 simple_idct_ARM: |
59 @@ void simple_idct_ARM(int16_t *block) | 59 @@ void simple_idct_ARM(int16_t *block) |
60 @@ save stack for reg needed (take all of them), | 60 @@ save stack for reg needed (take all of them), |
61 @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block | 61 @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block |
118 ldr r10, [r12, #offW5] @ R10=W5 | 118 ldr r10, [r12, #offW5] @ R10=W5 |
119 mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) | 119 mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) |
120 ldr r11, [r12, #offW7] @ R11=W7 | 120 ldr r11, [r12, #offW7] @ R11=W7 |
121 mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) | 121 mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) |
122 mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) | 122 mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) |
123 teq r2, #0 @ if null avoid muls | 123 teq r2, #0 @ if null avoid muls |
124 mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | 124 mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) |
125 rsbne r2, r2, #0 @ R2=-ROWr16[3] | 125 rsbne r2, r2, #0 @ R2=-ROWr16[3] |
126 mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | 126 mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) |
127 mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | 127 mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) |
128 mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | 128 mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) |
129 | 129 |
145 @@ MAC16(b0, W7, row[7]); | 145 @@ MAC16(b0, W7, row[7]); |
146 @@ MAC16(b2, W3, row[7]); | 146 @@ MAC16(b2, W3, row[7]); |
147 @@ MAC16(b3, -W1, row[7]); | 147 @@ MAC16(b3, -W1, row[7]); |
148 @@ MAC16(b1, -W5, row[7]); | 148 @@ MAC16(b1, -W5, row[7]); |
149 mov r3, r3, asr #16 @ R3=ROWr16[5] | 149 mov r3, r3, asr #16 @ R3=ROWr16[5] |
150 teq r3, #0 @ if null avoid muls | 150 teq r3, #0 @ if null avoid muls |
151 mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0 | 151 mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0 |
152 mov r4, r4, asr #16 @ R4=ROWr16[7] | 152 mov r4, r4, asr #16 @ R4=ROWr16[7] |
153 mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2 | 153 mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2 |
154 mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3 | 154 mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3 |
155 rsbne r3, r3, #0 @ R3=-ROWr16[5] | 155 rsbne r3, r3, #0 @ R3=-ROWr16[5] |
156 mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1 | 156 mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1 |
157 @@ R3 is free now | 157 @@ R3 is free now |
158 teq r4, #0 @ if null avoid muls | 158 teq r4, #0 @ if null avoid muls |
159 mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0 | 159 mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0 |
160 mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2 | 160 mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2 |
161 rsbne r4, r4, #0 @ R4=-ROWr16[7] | 161 rsbne r4, r4, #0 @ R4=-ROWr16[7] |
162 mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3 | 162 mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3 |
163 mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1 | 163 mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1 |
185 @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; | 185 @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; |
186 @@ if (temp != 0) {} | 186 @@ if (temp != 0) {} |
187 teq r2, #0 | 187 teq r2, #0 |
188 beq __end_bef_a_evaluation | 188 beq __end_bef_a_evaluation |
189 | 189 |
190 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) | 190 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) |
191 mul r11, r8, r4 @ R11=W2*ROWr16[2] | 191 mul r11, r8, r4 @ R11=W2*ROWr16[2] |
192 sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) | 192 sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) |
193 add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) | 193 add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) |
194 | 194 |
195 | 195 |
201 @@ a0 += W4*row[4] | 201 @@ a0 += W4*row[4] |
202 @@ a1 -= W4*row[4] | 202 @@ a1 -= W4*row[4] |
203 @@ a2 -= W4*row[4] | 203 @@ a2 -= W4*row[4] |
204 @@ a3 += W4*row[4] | 204 @@ a3 += W4*row[4] |
205 ldrsh r11, [r14, #8] @ R11=ROWr16[4] | 205 ldrsh r11, [r14, #8] @ R11=ROWr16[4] |
206 teq r11, #0 @ if null avoid muls | 206 teq r11, #0 @ if null avoid muls |
207 mulne r11, r9, r11 @ R11=W4*ROWr16[4] | 207 mulne r11, r9, r11 @ R11=W4*ROWr16[4] |
208 @@ R9 is free now | 208 @@ R9 is free now |
209 ldrsh r9, [r14, #12] @ R9=ROWr16[6] | 209 ldrsh r9, [r14, #12] @ R9=ROWr16[6] |
210 addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) | 210 addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) |
211 subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) | 211 subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) |
212 subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) | 212 subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) |
213 addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) | 213 addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) |
214 @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead | 214 @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead |
215 teq r9, #0 @ if null avoid muls | 215 teq r9, #0 @ if null avoid muls |
216 mulne r11, r10, r9 @ R11=W6*ROWr16[6] | 216 mulne r11, r10, r9 @ R11=W6*ROWr16[6] |
217 addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) | 217 addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) |
218 mulne r10, r8, r9 @ R10=W2*ROWr16[6] | 218 mulne r10, r8, r9 @ R10=W2*ROWr16[6] |
219 @@ a0 += W6*row[6]; | 219 @@ a0 += W6*row[6]; |
220 @@ a3 -= W6*row[6]; | 220 @@ a3 -= W6*row[6]; |
292 sub r14, r14, #16 | 292 sub r14, r14, #16 |
293 bne __row_loop | 293 bne __row_loop |
294 | 294 |
295 | 295 |
296 | 296 |
297 @@ at this point, R0=block, R1-R11 (free) | 297 @@ at this point, R0=block, R1-R11 (free) |
298 @@ R12=__const_ptr_, R14=&block[n] | 298 @@ R12=__const_ptr_, R14=&block[n] |
299 add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block. | 299 add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block. |
300 __col_loop: | 300 __col_loop: |
301 | 301 |
302 __b_evaluation2: | 302 __b_evaluation2: |
303 @@ at this point, R0=block (temp), R1-R11 (free) | 303 @@ at this point, R0=block (temp), R1-R11 (free) |
304 @@ R12=__const_ptr_, R14=&block[n] | 304 @@ R12=__const_ptr_, R14=&block[n] |
305 @@ proceed with b0-b3 first, followed by a0-a3 | 305 @@ proceed with b0-b3 first, followed by a0-a3 |
306 @@ MUL16(b0, W1, col[8x1]); | 306 @@ MUL16(b0, W1, col[8x1]); |
307 @@ MUL16(b1, W3, col[8x1]); | 307 @@ MUL16(b1, W3, col[8x1]); |
308 @@ MUL16(b2, W5, col[8x1]); | 308 @@ MUL16(b2, W5, col[8x1]); |
309 @@ MUL16(b3, W7, col[8x1]); | 309 @@ MUL16(b3, W7, col[8x1]); |
310 @@ MAC16(b0, W3, col[8x3]); | 310 @@ MAC16(b0, W3, col[8x3]); |
311 @@ MAC16(b1, -W7, col[8x3]); | 311 @@ MAC16(b1, -W7, col[8x3]); |
312 @@ MAC16(b2, -W1, col[8x3]); | 312 @@ MAC16(b2, -W1, col[8x3]); |
313 @@ MAC16(b3, -W5, col[8x3]); | 313 @@ MAC16(b3, -W5, col[8x3]); |
314 ldr r8, [r12, #offW1] @ R8=W1 | 314 ldr r8, [r12, #offW1] @ R8=W1 |
315 ldrsh r7, [r14, #16] | 315 ldrsh r7, [r14, #16] |
316 mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) | 316 mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) |
317 ldr r9, [r12, #offW3] @ R9=W3 | 317 ldr r9, [r12, #offW3] @ R9=W3 |
318 ldr r10, [r12, #offW5] @ R10=W5 | 318 ldr r10, [r12, #offW5] @ R10=W5 |
319 mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) | 319 mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) |
320 ldr r11, [r12, #offW7] @ R11=W7 | 320 ldr r11, [r12, #offW7] @ R11=W7 |
321 mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) | 321 mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) |
322 ldrsh r2, [r14, #48] | 322 ldrsh r2, [r14, #48] |
323 mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) | 323 mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) |
324 teq r2, #0 @ if 0, then avoid muls | 324 teq r2, #0 @ if 0, then avoid muls |
325 mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | 325 mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) |
326 rsbne r2, r2, #0 @ R2=-ROWr16[3] | 326 rsbne r2, r2, #0 @ R2=-ROWr16[3] |
327 mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | 327 mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) |
328 mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | 328 mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) |
329 mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | 329 mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) |
330 | 330 |
331 @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), | 331 @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), |
332 @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, | 332 @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, |
333 @@ R12=__const_ptr_, R14=&block[n] | 333 @@ R12=__const_ptr_, R14=&block[n] |
334 @@ MAC16(b0, W5, col[5x8]); | 334 @@ MAC16(b0, W5, col[5x8]); |
335 @@ MAC16(b2, W7, col[5x8]); | 335 @@ MAC16(b2, W7, col[5x8]); |
336 @@ MAC16(b3, W3, col[5x8]); | 336 @@ MAC16(b3, W3, col[5x8]); |
337 @@ MAC16(b1, -W1, col[5x8]); | 337 @@ MAC16(b1, -W1, col[5x8]); |
338 @@ MAC16(b0, W7, col[7x8]); | 338 @@ MAC16(b0, W7, col[7x8]); |
339 @@ MAC16(b2, W3, col[7x8]); | 339 @@ MAC16(b2, W3, col[7x8]); |
340 @@ MAC16(b3, -W1, col[7x8]); | 340 @@ MAC16(b3, -W1, col[7x8]); |
341 @@ MAC16(b1, -W5, col[7x8]); | 341 @@ MAC16(b1, -W5, col[7x8]); |
342 ldrsh r3, [r14, #80] @ R3=COLr16[5x8] | 342 ldrsh r3, [r14, #80] @ R3=COLr16[5x8] |
343 teq r3, #0 @ if 0 then avoid muls | 343 teq r3, #0 @ if 0 then avoid muls |
344 mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0 | 344 mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0 |
345 mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2 | 345 mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2 |
346 mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3 | 346 mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3 |
347 rsbne r3, r3, #0 @ R3=-ROWr16[5x8] | 347 rsbne r3, r3, #0 @ R3=-ROWr16[5x8] |
348 ldrsh r4, [r14, #112] @ R4=COLr16[7x8] | 348 ldrsh r4, [r14, #112] @ R4=COLr16[7x8] |
349 mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1 | 349 mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1 |
350 @@ R3 is free now | 350 @@ R3 is free now |
351 teq r4, #0 @ if 0 then avoid muls | 351 teq r4, #0 @ if 0 then avoid muls |
352 mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0 | 352 mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0 |
353 mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2 | 353 mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2 |
354 rsbne r4, r4, #0 @ R4=-ROWr16[7x8] | 354 rsbne r4, r4, #0 @ R4=-ROWr16[7x8] |
355 mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3 | 355 mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3 |
356 mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1 | 356 mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1 |
357 @@ R4 is free now | 357 @@ R4 is free now |
358 __end_b_evaluation2: | 358 __end_b_evaluation2: |
359 @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), | 359 @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), |
360 @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), | 360 @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), |
361 @@ R12=__const_ptr_, R14=&block[n] | 361 @@ R12=__const_ptr_, R14=&block[n] |
362 | 362 |
363 __a_evaluation2: | 363 __a_evaluation2: |
364 @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1)); | 364 @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1)); |
365 @@ a1 = a0 + W6 * row[2]; | 365 @@ a1 = a0 + W6 * row[2]; |
366 @@ a2 = a0 - W6 * row[2]; | 366 @@ a2 = a0 - W6 * row[2]; |
367 @@ a3 = a0 - W2 * row[2]; | 367 @@ a3 = a0 - W2 * row[2]; |
368 @@ a0 = a0 + W2 * row[2]; | 368 @@ a0 = a0 + W2 * row[2]; |
369 ldrsh r6, [r14, #0] | 369 ldrsh r6, [r14, #0] |
370 ldr r9, [r12, #offW4] @ R9=W4 | 370 ldr r9, [r12, #offW4] @ R9=W4 |
371 mul r6, r9, r6 @ R6=W4*ROWr16[0] | 371 mul r6, r9, r6 @ R6=W4*ROWr16[0] |
372 ldr r10, [r12, #offW6] @ R10=W6 | 372 ldr r10, [r12, #offW6] @ R10=W6 |
373 ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet) | 373 ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet) |
374 add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0) | 374 add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0) |
375 mul r11, r10, r4 @ R11=W6*ROWr16[2] | 375 mul r11, r10, r4 @ R11=W6*ROWr16[2] |
376 ldr r8, [r12, #offW2] @ R8=W2 | 376 ldr r8, [r12, #offW2] @ R8=W2 |
377 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) | 377 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) |
378 sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) | 378 sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) |
379 mul r11, r8, r4 @ R11=W2*ROWr16[2] | 379 mul r11, r8, r4 @ R11=W2*ROWr16[2] |
380 sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) | 380 sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) |
381 add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) | 381 add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) |
382 | 382 |
383 @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, | 383 @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, |
384 @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), | 384 @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), |
385 @@ R12=__const_ptr_, R14=&block[n] | 385 @@ R12=__const_ptr_, R14=&block[n] |
386 @@ a0 += W4*row[4] | 386 @@ a0 += W4*row[4] |
387 @@ a1 -= W4*row[4] | 387 @@ a1 -= W4*row[4] |
388 @@ a2 -= W4*row[4] | 388 @@ a2 -= W4*row[4] |
389 @@ a3 += W4*row[4] | 389 @@ a3 += W4*row[4] |
390 ldrsh r11, [r14, #64] @ R11=ROWr16[4] | 390 ldrsh r11, [r14, #64] @ R11=ROWr16[4] |
391 teq r11, #0 @ if null avoid muls | 391 teq r11, #0 @ if null avoid muls |
392 mulne r11, r9, r11 @ R11=W4*ROWr16[4] | 392 mulne r11, r9, r11 @ R11=W4*ROWr16[4] |
393 @@ R9 is free now | 393 @@ R9 is free now |
394 addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) | 394 addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) |
395 subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) | 395 subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) |
396 subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) | 396 subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) |
397 ldrsh r9, [r14, #96] @ R9=ROWr16[6] | 397 ldrsh r9, [r14, #96] @ R9=ROWr16[6] |
398 addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) | 398 addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) |
399 @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead | 399 @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead |
400 teq r9, #0 @ if null avoid muls | 400 teq r9, #0 @ if null avoid muls |
401 mulne r11, r10, r9 @ R11=W6*ROWr16[6] | 401 mulne r11, r10, r9 @ R11=W6*ROWr16[6] |
402 addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) | 402 addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) |
403 mulne r10, r8, r9 @ R10=W2*ROWr16[6] | 403 mulne r10, r8, r9 @ R10=W2*ROWr16[6] |
404 @@ a0 += W6*row[6]; | 404 @@ a0 += W6*row[6]; |
405 @@ a3 -= W6*row[6]; | 405 @@ a3 -= W6*row[6]; |
406 @@ a1 -= W2*row[6]; | 406 @@ a1 -= W2*row[6]; |
407 @@ a2 += W2*row[6]; | 407 @@ a2 += W2*row[6]; |
408 subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) | 408 subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) |
409 subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) | 409 subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) |
410 addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) | 410 addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) |
411 __end_a_evaluation2: | 411 __end_a_evaluation2: |
412 @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, | 412 @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, |
413 @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), | 413 @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), |
414 @@ R12=__const_ptr_, R14=&block[n] | 414 @@ R12=__const_ptr_, R14=&block[n] |
415 @@ col[0 ] = ((a0 + b0) >> COL_SHIFT); | 415 @@ col[0 ] = ((a0 + b0) >> COL_SHIFT); |
416 @@ col[8 ] = ((a1 + b1) >> COL_SHIFT); | 416 @@ col[8 ] = ((a1 + b1) >> COL_SHIFT); |
417 @@ col[16] = ((a2 + b2) >> COL_SHIFT); | 417 @@ col[16] = ((a2 + b2) >> COL_SHIFT); |
418 @@ col[24] = ((a3 + b3) >> COL_SHIFT); | 418 @@ col[24] = ((a3 + b3) >> COL_SHIFT); |
419 @@ col[32] = ((a3 - b3) >> COL_SHIFT); | 419 @@ col[32] = ((a3 - b3) >> COL_SHIFT); |
420 @@ col[40] = ((a2 - b2) >> COL_SHIFT); | 420 @@ col[40] = ((a2 - b2) >> COL_SHIFT); |
421 @@ col[48] = ((a1 - b1) >> COL_SHIFT); | 421 @@ col[48] = ((a1 - b1) >> COL_SHIFT); |
422 @@ col[56] = ((a0 - b0) >> COL_SHIFT); | 422 @@ col[56] = ((a0 - b0) >> COL_SHIFT); |
423 @@@@@ no optimisation here @@@@@ | 423 @@@@@ no optimisation here @@@@@ |
424 add r8, r6, r0 @ R8=a0+b0 | 424 add r8, r6, r0 @ R8=a0+b0 |
425 add r9, r2, r1 @ R9=a1+b1 | 425 add r9, r2, r1 @ R9=a1+b1 |
426 mov r8, r8, asr #COL_SHIFT | 426 mov r8, r8, asr #COL_SHIFT |
427 mov r9, r9, asr #COL_SHIFT | 427 mov r9, r9, asr #COL_SHIFT |
428 strh r8, [r14, #0] | 428 strh r8, [r14, #0] |
429 strh r9, [r14, #16] | 429 strh r9, [r14, #16] |
430 add r8, r3, r5 @ R8=a2+b2 | 430 add r8, r3, r5 @ R8=a2+b2 |
431 add r9, r4, r7 @ R9=a3+b3 | 431 add r9, r4, r7 @ R9=a3+b3 |
432 mov r8, r8, asr #COL_SHIFT | 432 mov r8, r8, asr #COL_SHIFT |
433 mov r9, r9, asr #COL_SHIFT | 433 mov r9, r9, asr #COL_SHIFT |
434 strh r8, [r14, #32] | 434 strh r8, [r14, #32] |
435 strh r9, [r14, #48] | 435 strh r9, [r14, #48] |
436 sub r8, r4, r7 @ R8=a3-b3 | 436 sub r8, r4, r7 @ R8=a3-b3 |
437 sub r9, r3, r5 @ R9=a2-b2 | 437 sub r9, r3, r5 @ R9=a2-b2 |
438 mov r8, r8, asr #COL_SHIFT | 438 mov r8, r8, asr #COL_SHIFT |
439 mov r9, r9, asr #COL_SHIFT | 439 mov r9, r9, asr #COL_SHIFT |
440 strh r8, [r14, #64] | 440 strh r8, [r14, #64] |
441 strh r9, [r14, #80] | 441 strh r9, [r14, #80] |
442 sub r8, r2, r1 @ R8=a1-b1 | 442 sub r8, r2, r1 @ R8=a1-b1 |
443 sub r9, r6, r0 @ R9=a0-b0 | 443 sub r9, r6, r0 @ R9=a0-b0 |
444 mov r8, r8, asr #COL_SHIFT | 444 mov r8, r8, asr #COL_SHIFT |
445 mov r9, r9, asr #COL_SHIFT | 445 mov r9, r9, asr #COL_SHIFT |
446 strh r8, [r14, #96] | 446 strh r8, [r14, #96] |
447 strh r9, [r14, #112] | 447 strh r9, [r14, #112] |
448 | 448 |
449 __end_col_loop: | 449 __end_col_loop: |
450 @@ at this point, R0-R11 (free) | 450 @@ at this point, R0-R11 (free) |
451 @@ R12=__const_ptr_, R14=&block[n] | 451 @@ R12=__const_ptr_, R14=&block[n] |
452 ldr r0, [sp, #0] @ R0=block | 452 ldr r0, [sp, #0] @ R0=block |
453 teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished. | 453 teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished. |
454 sub r14, r14, #2 | 454 sub r14, r14, #2 |
455 bne __col_loop | 455 bne __col_loop |
456 | 456 |
457 | 457 |
458 | 458 |
459 | 459 |
460 __end_simple_idct_ARM: | 460 __end_simple_idct_ARM: |
464 | 464 |
465 | 465 |
466 | 466 |
467 @@ kind of sub-function, here not to overload the common case. | 467 @@ kind of sub-function, here not to overload the common case. |
468 __end_bef_a_evaluation: | 468 __end_bef_a_evaluation: |
469 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) | 469 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) |
470 mul r11, r8, r4 @ R11=W2*ROWr16[2] | 470 mul r11, r8, r4 @ R11=W2*ROWr16[2] |
471 sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) | 471 sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) |
472 add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) | 472 add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) |
473 bal __end_a_evaluation | 473 bal __end_a_evaluation |
474 | 474 |
475 | 475 |
476 __constant_ptr__: @@ see #defines at the beginning of the source code for values. | 476 __constant_ptr__: @@ see #defines at the beginning of the source code for values. |
477 .align | 477 .align |
478 .word W1 | 478 .word W1 |
479 .word W2 | 479 .word W2 |
480 .word W3 | 480 .word W3 |
481 .word W4 | 481 .word W4 |
482 .word W5 | 482 .word W5 |