comparison armv4l/simple_idct_armv6.S @ 4452:c66326f1f635 libavcodec

optimize IDCT of rows with mostly zero coefficients
author mru
date Wed, 31 Jan 2007 23:04:56 +0000
parents cab2986ffc0b
children 99e960878498
comparison
equal deleted inserted replaced
4451:9fa2c8a7e4d8 4452:c66326f1f635
88 smlad v1, a3, ip, v1 /* A0 += W4*row[4] + W6*row[6] */ 88 smlad v1, a3, ip, v1 /* A0 += W4*row[4] + W6*row[6] */
89 smlsd v4, a3, ip, v4 /* A3 += W4*row[4] - W6*row[6] */ 89 smlsd v4, a3, ip, v4 /* A3 += W4*row[4] - W6*row[6] */
90 .endm 90 .endm
91 91
92 /* 92 /*
93 Compute partial IDCT of half row.
94 shift = left-shift amount
95 a3 = row[2,0]
96 a4 = row[3,1]
97
98 Output in registers v1--v8
99 */
100 .macro idct_row4 shift /* reads only a3 = row[2,0] and a4 = row[3,1]; overwrites a2, a3, ip, lr */
101 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */
102 ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */
103 ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */
104 mov a2, #(1<<(\shift-1)) /* a2 = rounding bias 1 << (shift-1); NOTE(review): implies a later right shift by \shift -- confirm in idct_finish_shift */
105 smlad v1, a3, ip, a2 /* v1 = A0 = W4*row[0] + W2*row[2] + bias */
106 smlsd v4, a3, ip, a2 /* v4 = A3 = W4*row[0] - W2*row[2] + bias */
107 ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */
108 smlad v2, a3, lr, a2 /* v2 = W4*row[0] + W6*row[2] + bias */
109 smlsd v3, a3, lr, a2 /* v3 = W4*row[0] - W6*row[2] + bias */
110 smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */
111 smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */
112 pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */
113 pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */
114 smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */
115 smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */
116 .endm
117
118 /*
93 Compute final part of IDCT single row without shift. 119 Compute final part of IDCT single row without shift.
94 Input in registers v1--v8 120 Input in registers v1--v8
95 Output in registers ip, v1--v3, lr, v5--v7 121 Output in registers ip, v1--v3, lr, v5--v7
96 */ 122 */
97 .macro idct_finish 123 .macro idct_finish
165 a2 = dest 191 a2 = dest
166 */ 192 */
167 .align 193 .align
168 .func idct_row_armv6 194 .func idct_row_armv6
169 idct_row_armv6: 195 idct_row_armv6:
170 stmfd sp!, {a2, lr} 196 str lr, [sp, #-4]!
197
198 ldr lr, [a1, #12] /* lr = row[7,5] */
199 ldr ip, [a1, #4] /* ip = row[6,4] */
200 ldr a4, [a1, #8] /* a4 = row[3,1] */
201 ldr a3, [a1] /* a3 = row[2,0] */
202 orrs lr, lr, ip /* lr = row[7,5] | row[6,4]; Z set iff row[4..7] are all zero */
203 cmpeq lr, a4 /* only if Z still set: Z stays set iff row[3,1] is zero too */
204 cmpeq lr, a3, lsr #16 /* only if Z still set: Z stays set iff row[2] is zero too */
205 beq 1f /* every coefficient except row[0] is zero: constant-fill path */
206 str a2, [sp, #-4]! /* push dest (a2); it is reloaded at 3: below */
207 cmp lr, #0 /* lr still holds row[7,5] | row[6,4] */
208 beq 2f /* row[4..7] all zero: use idct_row4 instead of idct_row */
171 209
172 idct_row ROW_SHIFT 210 idct_row ROW_SHIFT
173 ldr a2, [sp], #4 211 b 3f
212
213 2: idct_row4 ROW_SHIFT
214
215 3: ldr a2, [sp], #4 /* pop dest back into a2 */
174 idct_finish_shift ROW_SHIFT 216 idct_finish_shift ROW_SHIFT
175 217
176 strh v1, [a2] 218 strh v1, [a2]
177 strh v2, [a2, #(16*2)] 219 strh v2, [a2, #(16*2)]
178 strh v3, [a2, #(16*4)] 220 strh v3, [a2, #(16*4)]
180 strh fp, [a2, #(16*1)] 222 strh fp, [a2, #(16*1)]
181 strh v7, [a2, #(16*3)] 223 strh v7, [a2, #(16*3)]
182 strh v6, [a2, #(16*5)] 224 strh v6, [a2, #(16*5)]
183 strh v5, [a2, #(16*7)] 225 strh v5, [a2, #(16*7)]
184 226
227 ldr pc, [sp], #4 /* return by popping the saved lr straight into pc */
228
229 1: mov a3, a3, lsl #3 /* row[2] is zero on this path, so the low halfword becomes row[0] << 3 */
230 strh a3, [a2] /* the same halfword is written to all eight output slots */
231 strh a3, [a2, #(16*2)]
232 strh a3, [a2, #(16*4)]
233 strh a3, [a2, #(16*6)]
234 strh a3, [a2, #(16*1)]
235 strh a3, [a2, #(16*3)]
236 strh a3, [a2, #(16*5)]
237 strh a3, [a2, #(16*7)]
185 ldr pc, [sp], #4 238 ldr pc, [sp], #4
186 .endfunc 239 .endfunc
187 240
188 /* 241 /*
189 Compute IDCT of single column, read as row. 242 Compute IDCT of single column, read as row.