Mercurial > libavcodec.hg
comparison armv4l/simple_idct_armv6.S @ 4452:c66326f1f635 libavcodec
optimize IDCT of rows with mostly zero coefficients
author | mru |
---|---|
date | Wed, 31 Jan 2007 23:04:56 +0000 |
parents | cab2986ffc0b |
children | 99e960878498 |
comparison
equal
deleted
inserted
replaced
4451:9fa2c8a7e4d8 | 4452:c66326f1f635 |
---|---|
88 smlad v1, a3, ip, v1 /* A0 += W4*row[4] + W6*row[6] */ | 88 smlad v1, a3, ip, v1 /* A0 += W4*row[4] + W6*row[6] */ |
89 smlsd v4, a3, ip, v4 /* A3 += W4*row[4] - W6*row[6] */ | 89 smlsd v4, a3, ip, v4 /* A3 += W4*row[4] - W6*row[6] */ |
90 .endm | 90 .endm |
91 | 91 |
92 /* | 92 /* |
93 Compute partial IDCT of half row. | |
94 shift = shift amount (only the rounding bias 1 << (shift-1) is applied here) | |
95 a3 = row[2,0] | |
96 a4 = row[3,1] | |
97 | |
98 Output in registers v1--v8 | |
99 */ | |
100 .macro idct_row4 shift | |
101 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ | |
102 ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ | |
103 ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ | |
104 mov a2, #(1<<(\shift-1)) | |
105 smlad v1, a3, ip, a2 | |
106 smlsd v4, a3, ip, a2 | |
107 ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ | |
108 smlad v2, a3, lr, a2 | |
109 smlsd v3, a3, lr, a2 | |
110 smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ | |
111 smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ | |
112 pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */ | |
113 pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ | |
114 smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ | |
115 smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ | |
116 .endm | |
117 | |
118 /* | |
93 Compute final part of IDCT single row without shift. | 119 Compute final part of IDCT single row without shift. |
94 Input in registers v1--v8 | 120 Input in registers v1--v8 |
95 Output in registers ip, v1--v3, lr, v5--v7 | 121 Output in registers ip, v1--v3, lr, v5--v7 |
96 */ | 122 */ |
97 .macro idct_finish | 123 .macro idct_finish |
165 a2 = dest | 191 a2 = dest |
166 */ | 192 */ |
167 .align | 193 .align |
168 .func idct_row_armv6 | 194 .func idct_row_armv6 |
169 idct_row_armv6: | 195 idct_row_armv6: |
170 stmfd sp!, {a2, lr} | 196 str lr, [sp, #-4]! |
197 | |
198 ldr lr, [a1, #12] /* lr = row[7,5] */ | |
199 ldr ip, [a1, #4] /* ip = row[6,4] */ | |
200 ldr a4, [a1, #8] /* a4 = row[3,1] */ | |
201 ldr a3, [a1] /* a3 = row[2,0] */ | |
202 orrs lr, lr, ip | |
203 cmpeq lr, a4 | |
204 cmpeq lr, a3, lsr #16 | |
205 beq 1f | |
206 str a2, [sp, #-4]! | |
207 cmp lr, #0 | |
208 beq 2f | |
171 | 209 |
172 idct_row ROW_SHIFT | 210 idct_row ROW_SHIFT |
173 ldr a2, [sp], #4 | 211 b 3f |
212 | |
213 2: idct_row4 ROW_SHIFT | |
214 | |
215 3: ldr a2, [sp], #4 | |
174 idct_finish_shift ROW_SHIFT | 216 idct_finish_shift ROW_SHIFT |
175 | 217 |
176 strh v1, [a2] | 218 strh v1, [a2] |
177 strh v2, [a2, #(16*2)] | 219 strh v2, [a2, #(16*2)] |
178 strh v3, [a2, #(16*4)] | 220 strh v3, [a2, #(16*4)] |
180 strh fp, [a2, #(16*1)] | 222 strh fp, [a2, #(16*1)] |
181 strh v7, [a2, #(16*3)] | 223 strh v7, [a2, #(16*3)] |
182 strh v6, [a2, #(16*5)] | 224 strh v6, [a2, #(16*5)] |
183 strh v5, [a2, #(16*7)] | 225 strh v5, [a2, #(16*7)] |
184 | 226 |
227 ldr pc, [sp], #4 | |
228 | |
229 1: mov a3, a3, lsl #3 | |
230 strh a3, [a2] | |
231 strh a3, [a2, #(16*2)] | |
232 strh a3, [a2, #(16*4)] | |
233 strh a3, [a2, #(16*6)] | |
234 strh a3, [a2, #(16*1)] | |
235 strh a3, [a2, #(16*3)] | |
236 strh a3, [a2, #(16*5)] | |
237 strh a3, [a2, #(16*7)] | |
185 ldr pc, [sp], #4 | 238 ldr pc, [sp], #4 |
186 .endfunc | 239 .endfunc |
187 | 240 |
188 /* | 241 /* |
189 Compute IDCT of single column, read as row. | 242 Compute IDCT of single column, read as row. |