Mercurial > libavcodec.hg
comparison armv4l/simple_idct_armv6.S @ 4483:7a56dc39adef libavcodec
oops, revert accidental checkin
author | mru |
---|---|
date | Mon, 05 Feb 2007 21:18:11 +0000 |
parents | ee7422a921cb |
children | 97d82c7585b4 |
comparison
equal
deleted
inserted
replaced
4482:ee7422a921cb | 4483:7a56dc39adef |
---|---|
45 w42: .long W42 | 45 w42: .long W42 |
46 w42n: .long W42n | 46 w42n: .long W42n |
47 w46: .long W46 | 47 w46: .long W46 |
48 w57: .long W57 | 48 w57: .long W57 |
49 | 49 |
50 .macro idct_row_start shift | 50 /* |
51 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ | 51 Compute partial IDCT of single row. |
52 shift = left-shift amount | |
53 a1 = source address | |
54 a3 = row[2,0] <= 2 cycles | |
55 a4 = row[3,1] | |
56 ip = w42 <= 2 cycles | |
57 | |
58 Output in registers v1--v8 | |
59 */ | |
60 .macro idct_row shift | |
61 ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ | |
62 mov a2, #(1<<(\shift-1)) | |
63 smlad v1, a3, ip, a2 | |
64 smlsd v4, a3, ip, a2 | |
65 ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ | |
66 ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ | |
67 smlad v2, a3, lr, a2 | |
68 smlsd v3, a3, lr, a2 | |
69 | |
70 smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ | |
71 smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ | |
72 ldr lr, [a1, #12] /* lr = row[7,5] */ | |
73 pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */ | |
74 pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ | |
75 smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ | |
76 smlad v5, lr, v7, v5 /* B0 += W5*row[5] + W7*row[7] */ | |
77 smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ | |
78 | |
79 ldr a4, [pc, #(w42n-.-8)] /* a4 = -W4 | (-W2 << 16) */ | |
80 smlad v7, lr, a3, v7 /* B2 += W7*row[5] + W3*row[7] */ | |
81 ldr a3, [a1, #4] /* a3 = row[6,4] */ | |
82 smlsdx fp, lr, ip, fp /* B3 += W3*row[5] - W1*row[7] */ | |
83 ldr ip, [pc, #(w46-.-8)] /* ip = W4 | (W6 << 16) */ | |
84 smlad v6, lr, a2, v6 /* B1 -= W1*row[5] + W5*row[7] */ | |
85 | |
86 smlad v2, a3, a4, v2 /* A1 += -W4*row[4] - W2*row[6] */ | |
87 smlsd v3, a3, a4, v3 /* A2 += -W4*row[4] + W2*row[6] */ | |
88 smlad v1, a3, ip, v1 /* A0 += W4*row[4] + W6*row[6] */ | |
89 smlsd v4, a3, ip, v4 /* A3 += W4*row[4] - W6*row[6] */ | |
90 .endm | |
91 | |
92 /* | |
93 Compute partial IDCT of half row. | |
94 shift = left-shift amount | |
95 a3 = row[2,0] | |
96 a4 = row[3,1] | |
97 ip = w42 | |
98 | |
99 Output in registers v1--v8 | |
100 */ | |
101 .macro idct_row4 shift | |
52 ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ | 102 ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ |
53 ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ | 103 ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ |
54 mov a2, #(1<<(\shift-1)) | 104 mov a2, #(1<<(\shift-1)) |
55 smlad v1, a3, ip, a2 | 105 smlad v1, a3, ip, a2 |
56 smlsd v4, a3, ip, a2 | 106 smlsd v4, a3, ip, a2 |
57 ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ | 107 ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ |
58 smlad v2, a3, lr, a2 | 108 smlad v2, a3, lr, a2 |
59 smlsd v3, a3, lr, a2 | 109 smlsd v3, a3, lr, a2 |
60 smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ | 110 smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ |
61 smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ | 111 smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ |
62 .endm | |
63 /* | |
64 Compute partial IDCT of single row. | |
65 shift = left-shift amount | |
66 a1 = source address | |
67 a3 = row[2,0] <= 2 cycles | |
68 a4 = row[3,1] | |
69 ip = w42 <= 2 cycles | |
70 | |
71 Output in registers v1--v8 | |
72 */ | |
73 .macro idct_row shift | |
74 ldr lr, [a1, #12] /* lr = row[7,5] */ | |
75 pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */ | |
76 pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ | |
77 smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ | |
78 smlad v5, lr, v7, v5 /* B0 += W5*row[5] + W7*row[7] */ | |
79 smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ | |
80 | |
81 ldr a4, [pc, #(w42n-.-8)] /* a4 = -W4 | (-W2 << 16) */ | |
82 smlad v7, lr, a3, v7 /* B2 += W7*row[5] + W3*row[7] */ | |
83 ldr a3, [a1, #4] /* a3 = row[6,4] */ | |
84 smlsdx fp, lr, ip, fp /* B3 += W3*row[5] - W1*row[7] */ | |
85 ldr ip, [pc, #(w46-.-8)] /* ip = W4 | (W6 << 16) */ | |
86 smlad v6, lr, a2, v6 /* B1 -= W1*row[5] + W5*row[7] */ | |
87 | |
88 smlad v2, a3, a4, v2 /* A1 += -W4*row[4] - W2*row[6] */ | |
89 smlsd v3, a3, a4, v3 /* A2 += -W4*row[4] + W2*row[6] */ | |
90 smlad v1, a3, ip, v1 /* A0 += W4*row[4] + W6*row[6] */ | |
91 smlsd v4, a3, ip, v4 /* A3 += W4*row[4] - W6*row[6] */ | |
92 .endm | |
93 | |
94 /* | |
95 Compute partial IDCT of half row. | |
96 shift = left-shift amount | |
97 a3 = row[2,0] | |
98 a4 = row[3,1] | |
99 ip = w42 | |
100 | |
101 Output in registers v1--v8 | |
102 */ | |
103 .macro idct_row4 shift | |
104 pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */ | 112 pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */ |
105 pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ | 113 pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ |
106 smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ | 114 smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ |
107 smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ | 115 smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ |
108 .endm | 116 .endm |
183 a2 = dest | 191 a2 = dest |
184 */ | 192 */ |
185 .align | 193 .align |
186 .func idct_row_armv6 | 194 .func idct_row_armv6 |
187 idct_row_armv6: | 195 idct_row_armv6: |
188 ldr fp, [a1, #12] /* fp = row[7,5] */ | 196 str lr, [sp, #-4]! |
189 ldr v7, [a1, #4] /* v7 = row[6,4] */ | 197 |
198 ldr lr, [a1, #12] /* lr = row[7,5] */ | |
199 ldr ip, [a1, #4] /* ip = row[6,4] */ | |
190 ldr a4, [a1, #8] /* a4 = row[3,1] */ | 200 ldr a4, [a1, #8] /* a4 = row[3,1] */ |
191 ldr a3, [a1] /* a3 = row[2,0] */ | 201 ldr a3, [a1] /* a3 = row[2,0] */ |
192 mov ip, #(1<<(ROW_SHIFT-1)) | 202 orrs lr, lr, ip |
193 orrs v5, fp, v7 | 203 cmpeq lr, a4 |
194 cmpeq v5, a4 | 204 cmpeq lr, a3, lsr #16 |
195 cmpeq v5, a3, lsr #16 | |
196 beq 1f | 205 beq 1f |
197 cmp v5, #0 | 206 str a2, [sp, #-4]! |
198 stmfd sp!, {a2, lr} | 207 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ |
199 ldr v5, [pc, #(w42-.-8)] /* v5 = W4 | (W2 << 16) */ | 208 cmp lr, #0 |
200 ldr v6, [pc, #(w46-.-8)] /* v6 = W4 | (W6 << 16) */ | 209 beq 2f |
201 ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ | 210 |
202 | 211 idct_row ROW_SHIFT |
203 smlad v1, a3, v5, ip | 212 b 3f |
204 smlsd v4, a3, v5, ip | 213 |
205 ldr a2, [pc, #(w13-.-8)] /* a2 = W1 | (W3 << 16) */ | 214 2: idct_row4 ROW_SHIFT |
206 smlad v2, a3, v6, ip | |
207 smlsd v3, a3, v6, ip | |
208 smusdx lr, a4, v7 /* lr = B3 = W7*row[1] - W5*row[3] */ | |
209 smuad v5, a4, a2 /* v5 = B0 = W1*row[1] + W3*row[3] */ | |
210 | |
211 pkhtb a3, a2, v7, asr #16 /* a3 = W7 | (W3 << 16) */ | |
212 pkhbt ip, a2, v7, lsl #16 /* ip = W1 | (W5 << 16) */ | |
213 smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ | |
214 smusdx a4, a4, ip /* a4 = B2 = W5*row[1] - W1*row[3] */ | |
215 beq 3f | |
216 | |
217 smlad v5, fp, v7, v5 /* B0 += W5*row[5] + W7*row[7] */ | |
218 smlad v7, fp, a3, a4 /* B2 += W7*row[5] + W3*row[7] */ | |
219 ldr a4, [pc, #(w42n-.-8)] /* a4 = -W4 | (-W2 << 16) */ | |
220 ldr a3, [a1, #4] /* a3 = row[6,4] */ | |
221 smlsdx lr, fp, a2, lr /* B3 += W3*row[5] - W1*row[7] */ | |
222 ldr a2, [pc, #(w46-.-8)] /* a2 = W4 | (W6 << 16) */ | |
223 smlad v6, fp, ip, v6 /* B1 -= W1*row[5] + W5*row[7] */ | |
224 | |
225 smlad v2, a3, a4, v2 /* A1 += -W4*row[4] - W2*row[6] */ | |
226 smlsd v3, a3, a4, v3 /* A2 += -W4*row[4] + W2*row[6] */ | |
227 smlad v1, a3, a2, v1 /* A0 += W4*row[4] + W6*row[6] */ | |
228 smlsd v4, a3, a2, v4 /* A3 += W4*row[4] - W6*row[6] */ | |
229 | |
230 ldr a2, [sp], #4 | |
231 add a4, v1, v5 /* a4 = A0 + B0 */ | |
232 sub a3, v1, v5 /* a3 = A0 - B0 */ | |
233 mov v1, a4, asr #ROW_SHIFT | |
234 mov v5, a3, asr #ROW_SHIFT | |
235 | |
236 sub a4, v2, v6 /* a4 = A1 + B1 */ | |
237 add a3, v2, v6 /* a3 = A1 - B1 */ | |
238 mov v2, a4, asr #ROW_SHIFT | |
239 mov v6, a3, asr #ROW_SHIFT | |
240 | |
241 add a4, v3, v7 /* a4 = A2 + B2 */ | |
242 sub a3, v3, v7 /* a3 = A2 - B2 */ | |
243 mov v3, a4, asr #ROW_SHIFT | |
244 mov v7, a3, asr #ROW_SHIFT | |
245 | |
246 add a4, v4, lr /* a4 = A3 + B3 */ | |
247 sub a3, v4, lr /* a3 = A3 - B3 */ | |
248 mov v4, a4, asr #ROW_SHIFT | |
249 mov fp, a3, asr #ROW_SHIFT | |
250 | |
251 strh v1, [a2] | |
252 strh v2, [a2, #(16*2)] | |
253 strh v3, [a2, #(16*4)] | |
254 strh v4, [a2, #(16*6)] | |
255 strh fp, [a2, #(16*1)] | |
256 strh v7, [a2, #(16*3)] | |
257 strh v6, [a2, #(16*5)] | |
258 strh v5, [a2, #(16*7)] | |
259 | |
260 ldr pc, [sp], #4 | |
261 | 215 |
262 3: ldr a2, [sp], #4 | 216 3: ldr a2, [sp], #4 |
263 add v7, v1, v5 /* v7 = A0 + B0 */ | 217 idct_finish_shift ROW_SHIFT |
264 sub a3, v1, v5 /* a3 = A0 - B0 */ | |
265 mov v1, v7, asr #ROW_SHIFT | |
266 mov v5, a3, asr #ROW_SHIFT | |
267 | |
268 sub v7, v2, v6 /* v7 = A1 + B1 */ | |
269 add a3, v2, v6 /* a3 = A1 - B1 */ | |
270 mov v2, v7, asr #ROW_SHIFT | |
271 mov v6, a3, asr #ROW_SHIFT | |
272 | |
273 add v7, v3, a4 /* v7 = A2 + B2 */ | |
274 sub a3, v3, a4 /* a3 = A2 - B2 */ | |
275 mov v3, v7, asr #ROW_SHIFT | |
276 mov v7, a3, asr #ROW_SHIFT | |
277 | |
278 add a4, v4, lr /* a4 = A3 + B3 */ | |
279 sub a3, v4, lr /* a3 = A3 - B3 */ | |
280 mov v4, a4, asr #ROW_SHIFT | |
281 mov fp, a3, asr #ROW_SHIFT | |
282 | 218 |
283 strh v1, [a2] | 219 strh v1, [a2] |
284 strh v2, [a2, #(16*2)] | 220 strh v2, [a2, #(16*2)] |
285 strh v3, [a2, #(16*4)] | 221 strh v3, [a2, #(16*4)] |
286 strh v4, [a2, #(16*6)] | 222 strh v4, [a2, #(16*6)] |
298 strh a3, [a2, #(16*6)] | 234 strh a3, [a2, #(16*6)] |
299 strh a3, [a2, #(16*1)] | 235 strh a3, [a2, #(16*1)] |
300 strh a3, [a2, #(16*3)] | 236 strh a3, [a2, #(16*3)] |
301 strh a3, [a2, #(16*5)] | 237 strh a3, [a2, #(16*5)] |
302 strh a3, [a2, #(16*7)] | 238 strh a3, [a2, #(16*7)] |
303 mov pc, lr | 239 ldr pc, [sp], #4 |
304 .endfunc | 240 .endfunc |
305 | 241 |
306 /* | 242 /* |
307 Compute IDCT of single column, read as row. | 243 Compute IDCT of single column, read as row. |
308 a1 = source | 244 a1 = source |
312 .func idct_col_armv6 | 248 .func idct_col_armv6 |
313 idct_col_armv6: | 249 idct_col_armv6: |
314 stmfd sp!, {a2, lr} | 250 stmfd sp!, {a2, lr} |
315 | 251 |
316 ldr a3, [a1] /* a3 = row[2,0] */ | 252 ldr a3, [a1] /* a3 = row[2,0] */ |
253 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ | |
317 ldr a4, [a1, #8] /* a4 = row[3,1] */ | 254 ldr a4, [a1, #8] /* a4 = row[3,1] */ |
318 idct_row_start COL_SHIFT | |
319 idct_row COL_SHIFT | 255 idct_row COL_SHIFT |
320 ldr a2, [sp], #4 | 256 ldr a2, [sp], #4 |
321 idct_finish_shift COL_SHIFT | 257 idct_finish_shift COL_SHIFT |
322 | 258 |
323 strh v1, [a2] | 259 strh v1, [a2] |
342 .func idct_col_put_armv6 | 278 .func idct_col_put_armv6 |
343 idct_col_put_armv6: | 279 idct_col_put_armv6: |
344 stmfd sp!, {a2, a3, lr} | 280 stmfd sp!, {a2, a3, lr} |
345 | 281 |
346 ldr a3, [a1] /* a3 = row[2,0] */ | 282 ldr a3, [a1] /* a3 = row[2,0] */ |
283 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ | |
347 ldr a4, [a1, #8] /* a4 = row[3,1] */ | 284 ldr a4, [a1, #8] /* a4 = row[3,1] */ |
348 idct_row_start COL_SHIFT | |
349 idct_row COL_SHIFT | 285 idct_row COL_SHIFT |
350 ldmfd sp!, {a2, a3} | 286 ldmfd sp!, {a2, a3} |
351 idct_finish_shift_sat COL_SHIFT | 287 idct_finish_shift_sat COL_SHIFT |
352 | 288 |
353 strb v1, [a2], a3 | 289 strb v1, [a2], a3 |
374 .func idct_col_add_armv6 | 310 .func idct_col_add_armv6 |
375 idct_col_add_armv6: | 311 idct_col_add_armv6: |
376 stmfd sp!, {a2, a3, lr} | 312 stmfd sp!, {a2, a3, lr} |
377 | 313 |
378 ldr a3, [a1] /* a3 = row[2,0] */ | 314 ldr a3, [a1] /* a3 = row[2,0] */ |
315 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ | |
379 ldr a4, [a1, #8] /* a4 = row[3,1] */ | 316 ldr a4, [a1, #8] /* a4 = row[3,1] */ |
380 idct_row_start COL_SHIFT | |
381 idct_row COL_SHIFT | 317 idct_row COL_SHIFT |
382 ldmfd sp!, {a2, a3} | 318 ldmfd sp!, {a2, a3} |
383 idct_finish | 319 idct_finish |
384 | 320 |
385 ldrb a4, [a2] | 321 ldrb a4, [a2] |