Mercurial > libavcodec.hg
comparison arm/simple_idct_armv6.S @ 8575:0b9dff3a1ce2 libavcodec
ARM: use rX register names in simple_idct_armv6.S
author | mru |
---|---|
date | Mon, 12 Jan 2009 20:37:29 +0000 |
parents | 9281a8a9387a |
children | 636dc45f4779 |
comparison
equal
deleted
inserted
replaced
8574:d679fd3a5359 | 8575:0b9dff3a1ce2 |
---|---|
50 w57: .long W57 | 50 w57: .long W57 |
51 | 51 |
52 /* | 52 /* |
53 Compute partial IDCT of single row. | 53 Compute partial IDCT of single row. |
54 shift = left-shift amount | 54 shift = left-shift amount |
55 a1 = source address | 55 r0 = source address |
56 a3 = row[2,0] <= 2 cycles | 56 r2 = row[2,0] <= 2 cycles |
57 a4 = row[3,1] | 57 r3 = row[3,1] |
58 ip = w42 <= 2 cycles | 58 ip = w42 <= 2 cycles |
59 | 59 |
60 Output in registers v1--v8 | 60 Output in registers r4--r11 |
61 */ | 61 */ |
62 .macro idct_row shift | 62 .macro idct_row shift |
63 ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ | 63 ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ |
64 mov a2, #(1<<(\shift-1)) | 64 mov r1, #(1<<(\shift-1)) |
65 smlad v1, a3, ip, a2 | 65 smlad r4, r2, ip, r1 |
66 smlsd v4, a3, ip, a2 | 66 smlsd r7, r2, ip, r1 |
67 ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ | 67 ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ |
68 ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ | 68 ldr r10,[pc, #(w57-.-8)] /* r10 = W5 | (W7 << 16) */ |
69 smlad v2, a3, lr, a2 | 69 smlad r5, r2, lr, r1 |
70 smlsd v3, a3, lr, a2 | 70 smlsd r6, r2, lr, r1 |
71 | 71 |
72 smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ | 72 smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ |
73 smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ | 73 smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ |
74 ldr lr, [a1, #12] /* lr = row[7,5] */ | 74 ldr lr, [r0, #12] /* lr = row[7,5] */ |
75 pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */ | 75 pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */ |
76 pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ | 76 pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ |
77 smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ | 77 smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ |
78 smlad v5, lr, v7, v5 /* B0 += W5*row[5] + W7*row[7] */ | 78 smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */ |
79 smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ | 79 smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ |
80 | 80 |
81 ldr a4, [pc, #(w42n-.-8)] /* a4 = -W4 | (-W2 << 16) */ | 81 ldr r3, [pc, #(w42n-.-8)] /* r3 = -W4 | (-W2 << 16) */ |
82 smlad v7, lr, a3, v7 /* B2 += W7*row[5] + W3*row[7] */ | 82 smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */ |
83 ldr a3, [a1, #4] /* a3 = row[6,4] */ | 83 ldr r2, [r0, #4] /* r2 = row[6,4] */ |
84 smlsdx fp, lr, ip, fp /* B3 += W3*row[5] - W1*row[7] */ | 84 smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */ |
85 ldr ip, [pc, #(w46-.-8)] /* ip = W4 | (W6 << 16) */ | 85 ldr ip, [pc, #(w46-.-8)] /* ip = W4 | (W6 << 16) */ |
86 smlad v6, lr, a2, v6 /* B1 -= W1*row[5] + W5*row[7] */ | 86 smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */ |
87 | 87 |
88 smlad v2, a3, a4, v2 /* A1 += -W4*row[4] - W2*row[6] */ | 88 smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */ |
89 smlsd v3, a3, a4, v3 /* A2 += -W4*row[4] + W2*row[6] */ | 89 smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */ |
90 smlad v1, a3, ip, v1 /* A0 += W4*row[4] + W6*row[6] */ | 90 smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */ |
91 smlsd v4, a3, ip, v4 /* A3 += W4*row[4] - W6*row[6] */ | 91 smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */ |
92 .endm | 92 .endm |
93 | 93 |
94 /* | 94 /* |
95 Compute partial IDCT of half row. | 95 Compute partial IDCT of half row. |
96 shift = left-shift amount | 96 shift = left-shift amount |
97 a3 = row[2,0] | 97 r2 = row[2,0] |
98 a4 = row[3,1] | 98 r3 = row[3,1] |
99 ip = w42 | 99 ip = w42 |
100 | 100 |
101 Output in registers v1--v8 | 101 Output in registers r4--r11 |
102 */ | 102 */ |
103 .macro idct_row4 shift | 103 .macro idct_row4 shift |
104 ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ | 104 ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ |
105 ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ | 105 ldr r10,[pc, #(w57-.-8)] /* r10 = W5 | (W7 << 16) */ |
106 mov a2, #(1<<(\shift-1)) | 106 mov r1, #(1<<(\shift-1)) |
107 smlad v1, a3, ip, a2 | 107 smlad r4, r2, ip, r1 |
108 smlsd v4, a3, ip, a2 | 108 smlsd r7, r2, ip, r1 |
109 ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ | 109 ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ |
110 smlad v2, a3, lr, a2 | 110 smlad r5, r2, lr, r1 |
111 smlsd v3, a3, lr, a2 | 111 smlsd r6, r2, lr, r1 |
112 smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ | 112 smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ |
113 smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ | 113 smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ |
114 pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */ | 114 pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */ |
115 pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ | 115 pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ |
116 smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ | 116 smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ |
117 smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ | 117 smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ |
118 .endm | 118 .endm |
119 | 119 |
120 /* | 120 /* |
121 Compute final part of IDCT single row without shift. | 121 Compute final part of IDCT single row without shift. |
122 Input in registers v1--v8 | 122 Input in registers r4--r11 |
123 Output in registers ip, v1--v3, lr, v5--v7 | 123 Output in registers ip, r4--r6, lr, r8--r10 |
124 */ | 124 */ |
125 .macro idct_finish | 125 .macro idct_finish |
126 add ip, v1, v5 /* ip = A0 + B0 */ | 126 add ip, r4, r8 /* ip = A0 + B0 */ |
127 sub lr, v1, v5 /* lr = A0 - B0 */ | 127 sub lr, r4, r8 /* lr = A0 - B0 */ |
128 sub v1, v2, v6 /* v1 = A1 + B1 */ | 128 sub r4, r5, r9 /* r4 = A1 + B1 */ |
129 add v5, v2, v6 /* v5 = A1 - B1 */ | 129 add r8, r5, r9 /* r8 = A1 - B1 */ |
130 add v2, v3, v7 /* v2 = A2 + B2 */ | 130 add r5, r6, r10 /* r5 = A2 + B2 */ |
131 sub v6, v3, v7 /* v6 = A2 - B2 */ | 131 sub r9, r6, r10 /* r9 = A2 - B2 */ |
132 add v3, v4, fp /* v3 = A3 + B3 */ | 132 add r6, r7, r11 /* r6 = A3 + B3 */ |
133 sub v7, v4, fp /* v7 = A3 - B3 */ | 133 sub r10,r7, r11 /* r10 = A3 - B3 */ |
134 .endm | 134 .endm |
135 | 135 |
136 /* | 136 /* |
137 Compute final part of IDCT single row. | 137 Compute final part of IDCT single row. |
138 shift = right-shift amount | 138 shift = right-shift amount |
139 Input/output in registers v1--v8 | 139 Input/output in registers r4--r11 |
140 */ | 140 */ |
141 .macro idct_finish_shift shift | 141 .macro idct_finish_shift shift |
142 add a4, v1, v5 /* a4 = A0 + B0 */ | 142 add r3, r4, r8 /* r3 = A0 + B0 */ |
143 sub a3, v1, v5 /* a3 = A0 - B0 */ | 143 sub r2, r4, r8 /* r2 = A0 - B0 */ |
144 mov v1, a4, asr #\shift | 144 mov r4, r3, asr #\shift |
145 mov v5, a3, asr #\shift | 145 mov r8, r2, asr #\shift |
146 | 146 |
147 sub a4, v2, v6 /* a4 = A1 + B1 */ | 147 sub r3, r5, r9 /* r3 = A1 + B1 */ |
148 add a3, v2, v6 /* a3 = A1 - B1 */ | 148 add r2, r5, r9 /* r2 = A1 - B1 */ |
149 mov v2, a4, asr #\shift | 149 mov r5, r3, asr #\shift |
150 mov v6, a3, asr #\shift | 150 mov r9, r2, asr #\shift |
151 | 151 |
152 add a4, v3, v7 /* a4 = A2 + B2 */ | 152 add r3, r6, r10 /* r3 = A2 + B2 */ |
153 sub a3, v3, v7 /* a3 = A2 - B2 */ | 153 sub r2, r6, r10 /* r2 = A2 - B2 */ |
154 mov v3, a4, asr #\shift | 154 mov r6, r3, asr #\shift |
155 mov v7, a3, asr #\shift | 155 mov r10,r2, asr #\shift |
156 | 156 |
157 add a4, v4, fp /* a4 = A3 + B3 */ | 157 add r3, r7, r11 /* r3 = A3 + B3 */ |
158 sub a3, v4, fp /* a3 = A3 - B3 */ | 158 sub r2, r7, r11 /* r2 = A3 - B3 */ |
159 mov v4, a4, asr #\shift | 159 mov r7, r3, asr #\shift |
160 mov fp, a3, asr #\shift | 160 mov r11,r2, asr #\shift |
161 .endm | 161 .endm |
162 | 162 |
163 /* | 163 /* |
164 Compute final part of IDCT single row, saturating results at 8 bits. | 164 Compute final part of IDCT single row, saturating results at 8 bits. |
165 shift = right-shift amount | 165 shift = right-shift amount |
166 Input/output in registers v1--v8 | 166 Input/output in registers r4--r11 |
167 */ | 167 */ |
168 .macro idct_finish_shift_sat shift | 168 .macro idct_finish_shift_sat shift |
169 add a4, v1, v5 /* a4 = A0 + B0 */ | 169 add r3, r4, r8 /* r3 = A0 + B0 */ |
170 sub ip, v1, v5 /* ip = A0 - B0 */ | 170 sub ip, r4, r8 /* ip = A0 - B0 */ |
171 usat v1, #8, a4, asr #\shift | 171 usat r4, #8, r3, asr #\shift |
172 usat v5, #8, ip, asr #\shift | 172 usat r8, #8, ip, asr #\shift |
173 | 173 |
174 sub a4, v2, v6 /* a4 = A1 + B1 */ | 174 sub r3, r5, r9 /* r3 = A1 + B1 */ |
175 add ip, v2, v6 /* ip = A1 - B1 */ | 175 add ip, r5, r9 /* ip = A1 - B1 */ |
176 usat v2, #8, a4, asr #\shift | 176 usat r5, #8, r3, asr #\shift |
177 usat v6, #8, ip, asr #\shift | 177 usat r9, #8, ip, asr #\shift |
178 | 178 |
179 add a4, v3, v7 /* a4 = A2 + B2 */ | 179 add r3, r6, r10 /* r3 = A2 + B2 */ |
180 sub ip, v3, v7 /* ip = A2 - B2 */ | 180 sub ip, r6, r10 /* ip = A2 - B2 */ |
181 usat v3, #8, a4, asr #\shift | 181 usat r6, #8, r3, asr #\shift |
182 usat v7, #8, ip, asr #\shift | 182 usat r10,#8, ip, asr #\shift |
183 | 183 |
184 add a4, v4, fp /* a4 = A3 + B3 */ | 184 add r3, r7, r11 /* r3 = A3 + B3 */ |
185 sub ip, v4, fp /* ip = A3 - B3 */ | 185 sub ip, r7, r11 /* ip = A3 - B3 */ |
186 usat v4, #8, a4, asr #\shift | 186 usat r7, #8, r3, asr #\shift |
187 usat fp, #8, ip, asr #\shift | 187 usat r11,#8, ip, asr #\shift |
188 .endm | 188 .endm |
189 | 189 |
190 /* | 190 /* |
191 Compute IDCT of single row, storing as column. | 191 Compute IDCT of single row, storing as column. |
192 a1 = source | 192 r0 = source |
193 a2 = dest | 193 r1 = dest |
194 */ | 194 */ |
195 function idct_row_armv6 | 195 function idct_row_armv6 |
196 str lr, [sp, #-4]! | 196 str lr, [sp, #-4]! |
197 | 197 |
198 ldr lr, [a1, #12] /* lr = row[7,5] */ | 198 ldr lr, [r0, #12] /* lr = row[7,5] */ |
199 ldr ip, [a1, #4] /* ip = row[6,4] */ | 199 ldr ip, [r0, #4] /* ip = row[6,4] */ |
200 ldr a4, [a1, #8] /* a4 = row[3,1] */ | 200 ldr r3, [r0, #8] /* r3 = row[3,1] */ |
201 ldr a3, [a1] /* a3 = row[2,0] */ | 201 ldr r2, [r0] /* r2 = row[2,0] */ |
202 orrs lr, lr, ip | 202 orrs lr, lr, ip |
203 cmpeq lr, a4 | 203 cmpeq lr, r3 |
204 cmpeq lr, a3, lsr #16 | 204 cmpeq lr, r2, lsr #16 |
205 beq 1f | 205 beq 1f |
206 str a2, [sp, #-4]! | 206 str r1, [sp, #-4]! |
207 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ | 207 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ |
208 cmp lr, #0 | 208 cmp lr, #0 |
209 beq 2f | 209 beq 2f |
210 | 210 |
211 idct_row ROW_SHIFT | 211 idct_row ROW_SHIFT |
212 b 3f | 212 b 3f |
213 | 213 |
214 2: idct_row4 ROW_SHIFT | 214 2: idct_row4 ROW_SHIFT |
215 | 215 |
216 3: ldr a2, [sp], #4 | 216 3: ldr r1, [sp], #4 |
217 idct_finish_shift ROW_SHIFT | 217 idct_finish_shift ROW_SHIFT |
218 | 218 |
219 strh v1, [a2] | 219 strh r4, [r1] |
220 strh v2, [a2, #(16*2)] | 220 strh r5, [r1, #(16*2)] |
221 strh v3, [a2, #(16*4)] | 221 strh r6, [r1, #(16*4)] |
222 strh v4, [a2, #(16*6)] | 222 strh r7, [r1, #(16*6)] |
223 strh fp, [a2, #(16*1)] | 223 strh r11,[r1, #(16*1)] |
224 strh v7, [a2, #(16*3)] | 224 strh r10,[r1, #(16*3)] |
225 strh v6, [a2, #(16*5)] | 225 strh r9, [r1, #(16*5)] |
226 strh v5, [a2, #(16*7)] | 226 strh r8, [r1, #(16*7)] |
227 | 227 |
228 ldr pc, [sp], #4 | 228 ldr pc, [sp], #4 |
229 | 229 |
230 1: mov a3, a3, lsl #3 | 230 1: mov r2, r2, lsl #3 |
231 strh a3, [a2] | 231 strh r2, [r1] |
232 strh a3, [a2, #(16*2)] | 232 strh r2, [r1, #(16*2)] |
233 strh a3, [a2, #(16*4)] | 233 strh r2, [r1, #(16*4)] |
234 strh a3, [a2, #(16*6)] | 234 strh r2, [r1, #(16*6)] |
235 strh a3, [a2, #(16*1)] | 235 strh r2, [r1, #(16*1)] |
236 strh a3, [a2, #(16*3)] | 236 strh r2, [r1, #(16*3)] |
237 strh a3, [a2, #(16*5)] | 237 strh r2, [r1, #(16*5)] |
238 strh a3, [a2, #(16*7)] | 238 strh r2, [r1, #(16*7)] |
239 ldr pc, [sp], #4 | 239 ldr pc, [sp], #4 |
240 .endfunc | 240 .endfunc |
241 | 241 |
242 /* | 242 /* |
243 Compute IDCT of single column, read as row. | 243 Compute IDCT of single column, read as row. |
244 a1 = source | 244 r0 = source |
245 a2 = dest | 245 r1 = dest |
246 */ | 246 */ |
247 function idct_col_armv6 | 247 function idct_col_armv6 |
248 stmfd sp!, {a2, lr} | 248 stmfd sp!, {r1, lr} |
249 | 249 |
250 ldr a3, [a1] /* a3 = row[2,0] */ | 250 ldr r2, [r0] /* r2 = row[2,0] */ |
251 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ | 251 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ |
252 ldr a4, [a1, #8] /* a4 = row[3,1] */ | 252 ldr r3, [r0, #8] /* r3 = row[3,1] */ |
253 idct_row COL_SHIFT | 253 idct_row COL_SHIFT |
254 ldr a2, [sp], #4 | 254 ldr r1, [sp], #4 |
255 idct_finish_shift COL_SHIFT | 255 idct_finish_shift COL_SHIFT |
256 | 256 |
257 strh v1, [a2] | 257 strh r4, [r1] |
258 strh v2, [a2, #(16*1)] | 258 strh r5, [r1, #(16*1)] |
259 strh v3, [a2, #(16*2)] | 259 strh r6, [r1, #(16*2)] |
260 strh v4, [a2, #(16*3)] | 260 strh r7, [r1, #(16*3)] |
261 strh fp, [a2, #(16*4)] | 261 strh r11,[r1, #(16*4)] |
262 strh v7, [a2, #(16*5)] | 262 strh r10,[r1, #(16*5)] |
263 strh v6, [a2, #(16*6)] | 263 strh r9, [r1, #(16*6)] |
264 strh v5, [a2, #(16*7)] | 264 strh r8, [r1, #(16*7)] |
265 | 265 |
266 ldr pc, [sp], #4 | 266 ldr pc, [sp], #4 |
267 .endfunc | 267 .endfunc |
268 | 268 |
269 /* | 269 /* |
270 Compute IDCT of single column, read as row, store saturated 8-bit. | 270 Compute IDCT of single column, read as row, store saturated 8-bit. |
271 a1 = source | 271 r0 = source |
272 a2 = dest | 272 r1 = dest |
273 a3 = line size | 273 r2 = line size |
274 */ | 274 */ |
275 function idct_col_put_armv6 | 275 function idct_col_put_armv6 |
276 stmfd sp!, {a2, a3, lr} | 276 stmfd sp!, {r1, r2, lr} |
277 | 277 |
278 ldr a3, [a1] /* a3 = row[2,0] */ | 278 ldr r2, [r0] /* r2 = row[2,0] */ |
279 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ | 279 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ |
280 ldr a4, [a1, #8] /* a4 = row[3,1] */ | 280 ldr r3, [r0, #8] /* r3 = row[3,1] */ |
281 idct_row COL_SHIFT | 281 idct_row COL_SHIFT |
282 ldmfd sp!, {a2, a3} | 282 ldmfd sp!, {r1, r2} |
283 idct_finish_shift_sat COL_SHIFT | 283 idct_finish_shift_sat COL_SHIFT |
284 | 284 |
285 strb v1, [a2], a3 | 285 strb r4, [r1], r2 |
286 strb v2, [a2], a3 | 286 strb r5, [r1], r2 |
287 strb v3, [a2], a3 | 287 strb r6, [r1], r2 |
288 strb v4, [a2], a3 | 288 strb r7, [r1], r2 |
289 strb fp, [a2], a3 | 289 strb r11,[r1], r2 |
290 strb v7, [a2], a3 | 290 strb r10,[r1], r2 |
291 strb v6, [a2], a3 | 291 strb r9, [r1], r2 |
292 strb v5, [a2], a3 | 292 strb r8, [r1], r2 |
293 | 293 |
294 sub a2, a2, a3, lsl #3 | 294 sub r1, r1, r2, lsl #3 |
295 | 295 |
296 ldr pc, [sp], #4 | 296 ldr pc, [sp], #4 |
297 .endfunc | 297 .endfunc |
298 | 298 |
299 /* | 299 /* |
300 Compute IDCT of single column, read as row, add/store saturated 8-bit. | 300 Compute IDCT of single column, read as row, add/store saturated 8-bit. |
301 a1 = source | 301 r0 = source |
302 a2 = dest | 302 r1 = dest |
303 a3 = line size | 303 r2 = line size |
304 */ | 304 */ |
305 function idct_col_add_armv6 | 305 function idct_col_add_armv6 |
306 stmfd sp!, {a2, a3, lr} | 306 stmfd sp!, {r1, r2, lr} |
307 | 307 |
308 ldr a3, [a1] /* a3 = row[2,0] */ | 308 ldr r2, [r0] /* r2 = row[2,0] */ |
309 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ | 309 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ |
310 ldr a4, [a1, #8] /* a4 = row[3,1] */ | 310 ldr r3, [r0, #8] /* r3 = row[3,1] */ |
311 idct_row COL_SHIFT | 311 idct_row COL_SHIFT |
312 ldmfd sp!, {a2, a3} | 312 ldmfd sp!, {r1, r2} |
313 idct_finish | 313 idct_finish |
314 | 314 |
315 ldrb a4, [a2] | 315 ldrb r3, [r1] |
316 ldrb v4, [a2, a3] | 316 ldrb r7, [r1, r2] |
317 ldrb fp, [a2, a3, lsl #2] | 317 ldrb r11,[r1, r2, lsl #2] |
318 add ip, a4, ip, asr #COL_SHIFT | 318 add ip, r3, ip, asr #COL_SHIFT |
319 usat ip, #8, ip | 319 usat ip, #8, ip |
320 add v1, v4, v1, asr #COL_SHIFT | 320 add r4, r7, r4, asr #COL_SHIFT |
321 strb ip, [a2], a3 | 321 strb ip, [r1], r2 |
322 ldrb ip, [a2, a3] | 322 ldrb ip, [r1, r2] |
323 usat v1, #8, v1 | 323 usat r4, #8, r4 |
324 ldrb fp, [a2, a3, lsl #2] | 324 ldrb r11,[r1, r2, lsl #2] |
325 add v2, ip, v2, asr #COL_SHIFT | 325 add r5, ip, r5, asr #COL_SHIFT |
326 usat v2, #8, v2 | 326 usat r5, #8, r5 |
327 strb v1, [a2], a3 | 327 strb r4, [r1], r2 |
328 ldrb a4, [a2, a3] | 328 ldrb r3, [r1, r2] |
329 ldrb ip, [a2, a3, lsl #2] | 329 ldrb ip, [r1, r2, lsl #2] |
330 strb v2, [a2], a3 | 330 strb r5, [r1], r2 |
331 ldrb v4, [a2, a3] | 331 ldrb r7, [r1, r2] |
332 ldrb v1, [a2, a3, lsl #2] | 332 ldrb r4, [r1, r2, lsl #2] |
333 add v3, a4, v3, asr #COL_SHIFT | 333 add r6, r3, r6, asr #COL_SHIFT |
334 usat v3, #8, v3 | 334 usat r6, #8, r6 |
335 add v7, v4, v7, asr #COL_SHIFT | 335 add r10,r7, r10,asr #COL_SHIFT |
336 usat v7, #8, v7 | 336 usat r10,#8, r10 |
337 add v6, fp, v6, asr #COL_SHIFT | 337 add r9, r11,r9, asr #COL_SHIFT |
338 usat v6, #8, v6 | 338 usat r9, #8, r9 |
339 add v5, ip, v5, asr #COL_SHIFT | 339 add r8, ip, r8, asr #COL_SHIFT |
340 usat v5, #8, v5 | 340 usat r8, #8, r8 |
341 add lr, v1, lr, asr #COL_SHIFT | 341 add lr, r4, lr, asr #COL_SHIFT |
342 usat lr, #8, lr | 342 usat lr, #8, lr |
343 strb v3, [a2], a3 | 343 strb r6, [r1], r2 |
344 strb v7, [a2], a3 | 344 strb r10,[r1], r2 |
345 strb v6, [a2], a3 | 345 strb r9, [r1], r2 |
346 strb v5, [a2], a3 | 346 strb r8, [r1], r2 |
347 strb lr, [a2], a3 | 347 strb lr, [r1], r2 |
348 | 348 |
349 sub a2, a2, a3, lsl #3 | 349 sub r1, r1, r2, lsl #3 |
350 | 350 |
351 ldr pc, [sp], #4 | 351 ldr pc, [sp], #4 |
352 .endfunc | 352 .endfunc |
353 | 353 |
354 /* | 354 /* |
356 func = IDCT row->col function | 356 func = IDCT row->col function |
357 width = width of columns in bytes | 357 width = width of columns in bytes |
358 */ | 358 */ |
359 .macro idct_rows func width | 359 .macro idct_rows func width |
360 bl \func | 360 bl \func |
361 add a1, a1, #(16*2) | 361 add r0, r0, #(16*2) |
362 add a2, a2, #\width | 362 add r1, r1, #\width |
363 bl \func | 363 bl \func |
364 add a1, a1, #(16*2) | 364 add r0, r0, #(16*2) |
365 add a2, a2, #\width | 365 add r1, r1, #\width |
366 bl \func | 366 bl \func |
367 add a1, a1, #(16*2) | 367 add r0, r0, #(16*2) |
368 add a2, a2, #\width | 368 add r1, r1, #\width |
369 bl \func | 369 bl \func |
370 sub a1, a1, #(16*5) | 370 sub r0, r0, #(16*5) |
371 add a2, a2, #\width | 371 add r1, r1, #\width |
372 bl \func | 372 bl \func |
373 add a1, a1, #(16*2) | 373 add r0, r0, #(16*2) |
374 add a2, a2, #\width | 374 add r1, r1, #\width |
375 bl \func | 375 bl \func |
376 add a1, a1, #(16*2) | 376 add r0, r0, #(16*2) |
377 add a2, a2, #\width | 377 add r1, r1, #\width |
378 bl \func | 378 bl \func |
379 add a1, a1, #(16*2) | 379 add r0, r0, #(16*2) |
380 add a2, a2, #\width | 380 add r1, r1, #\width |
381 bl \func | 381 bl \func |
382 | 382 |
383 sub a1, a1, #(16*7) | 383 sub r0, r0, #(16*7) |
384 .endm | 384 .endm |
385 | 385 |
386 /* void ff_simple_idct_armv6(DCTELEM *data); */ | 386 /* void ff_simple_idct_armv6(DCTELEM *data); */ |
387 function ff_simple_idct_armv6, export=1 | 387 function ff_simple_idct_armv6, export=1 |
388 stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} | 388 stmfd sp!, {r4-r11, lr} |
389 sub sp, sp, #128 | 389 sub sp, sp, #128 |
390 | 390 |
391 mov a2, sp | 391 mov r1, sp |
392 idct_rows idct_row_armv6, 2 | 392 idct_rows idct_row_armv6, 2 |
393 mov a2, a1 | 393 mov r1, r0 |
394 mov a1, sp | 394 mov r0, sp |
395 idct_rows idct_col_armv6, 2 | 395 idct_rows idct_col_armv6, 2 |
396 | 396 |
397 add sp, sp, #128 | 397 add sp, sp, #128 |
398 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} | 398 ldmfd sp!, {r4-r11, pc} |
399 .endfunc | 399 .endfunc |
400 | 400 |
401 /* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ | 401 /* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ |
402 function ff_simple_idct_add_armv6, export=1 | 402 function ff_simple_idct_add_armv6, export=1 |
403 stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} | 403 stmfd sp!, {r0, r1, r4-r11, lr} |
404 sub sp, sp, #128 | 404 sub sp, sp, #128 |
405 | 405 |
406 mov a1, a3 | 406 mov r0, r2 |
407 mov a2, sp | 407 mov r1, sp |
408 idct_rows idct_row_armv6, 2 | 408 idct_rows idct_row_armv6, 2 |
409 mov a1, sp | 409 mov r0, sp |
410 ldr a2, [sp, #128] | 410 ldr r1, [sp, #128] |
411 ldr a3, [sp, #(128+4)] | 411 ldr r2, [sp, #(128+4)] |
412 idct_rows idct_col_add_armv6, 1 | 412 idct_rows idct_col_add_armv6, 1 |
413 | 413 |
414 add sp, sp, #(128+8) | 414 add sp, sp, #(128+8) |
415 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} | 415 ldmfd sp!, {r4-r11, pc} |
416 .endfunc | 416 .endfunc |
417 | 417 |
418 /* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ | 418 /* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ |
419 function ff_simple_idct_put_armv6, export=1 | 419 function ff_simple_idct_put_armv6, export=1 |
420 stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} | 420 stmfd sp!, {r0, r1, r4-r11, lr} |
421 sub sp, sp, #128 | 421 sub sp, sp, #128 |
422 | 422 |
423 mov a1, a3 | 423 mov r0, r2 |
424 mov a2, sp | 424 mov r1, sp |
425 idct_rows idct_row_armv6, 2 | 425 idct_rows idct_row_armv6, 2 |
426 mov a1, sp | 426 mov r0, sp |
427 ldr a2, [sp, #128] | 427 ldr r1, [sp, #128] |
428 ldr a3, [sp, #(128+4)] | 428 ldr r2, [sp, #(128+4)] |
429 idct_rows idct_col_put_armv6, 1 | 429 idct_rows idct_col_put_armv6, 1 |
430 | 430 |
431 add sp, sp, #(128+8) | 431 add sp, sp, #(128+8) |
432 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} | 432 ldmfd sp!, {r4-r11, pc} |
433 .endfunc | 433 .endfunc |