Mercurial > libavcodec.hg
annotate alpha/motion_est_alpha.c @ 12483:0159a19bfff7 libavcodec
aacdec: Rework channel mapping compatibility hacks.
For a PCE based configuration map the channels solely based on tags.
For an indexed configuration map the channels solely based on position.
This works with all known exotic samples including al17, elem_id0, bad_concat,
and lfe_is_sce.
author | alexc |
---|---|
date | Fri, 10 Sep 2010 18:01:48 +0000 |
parents | 9e7d38743146 |
children |
rev | line source |
---|---|
586 | 1 /* |
2 * Alpha optimized DSP utils | |
3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org> | |
4 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
5 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
6 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
7 * FFmpeg is free software; you can redistribute it and/or |
586 | 8 * modify it under the terms of the GNU Lesser General Public |
9 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
10 * version 2.1 of the License, or (at your option) any later version. |
586 | 11 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
12 * FFmpeg is distributed in the hope that it will be useful, |
586 | 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
18 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
586 | 20 */ |
21 | |
6763 | 22 #include "libavcodec/dsputil.h" |
11396 | 23 #include "dsputil_alpha.h" |
586 | 24 #include "asm.h" |
25 | |
26 void get_pixels_mvi(DCTELEM *restrict block, | |
27 const uint8_t *restrict pixels, int line_size) | |
28 { | |
29 int h = 8; | |
30 | |
31 do { | |
32 uint64_t p; | |
33 | |
34 p = ldq(pixels); | |
35 stq(unpkbw(p), block); | |
2967 | 36 stq(unpkbw(p >> 32), block + 4); |
586 | 37 |
38 pixels += line_size; | |
39 block += 8; | |
40 } while (--h); | |
41 } | |
42 | |
43 void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, | |
44 int stride) { | |
45 int h = 8; | |
46 uint64_t mask = 0x4040; | |
47 | |
48 mask |= mask << 16; | |
49 mask |= mask << 32; | |
50 do { | |
51 uint64_t x, y, c, d, a; | |
52 uint64_t signs; | |
53 | |
54 x = ldq(s1); | |
55 y = ldq(s2); | |
56 c = cmpbge(x, y); | |
57 d = x - y; | |
58 a = zap(mask, c); /* We use 0x4040404040404040 here... */ | |
59 d += 4 * a; /* ...so we can use s4addq here. */ | |
60 signs = zap(-1, c); | |
61 | |
62 stq(unpkbw(d) | (unpkbw(signs) << 8), block); | |
63 stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4); | |
64 | |
65 s1 += stride; | |
66 s2 += stride; | |
67 block += 8; | |
68 } while (--h); | |
69 } | |
70 | |
/*
 * Average two packed vectors of eight unsigned bytes, lane by lane,
 * rounding halves up: (a | b) - ((a ^ b) >> 1) equals (a + b + 1) >> 1
 * for each lane.
 *
 * Masking with 0xfe before the shift stops the low bit of one lane from
 * sliding into the top of the lane below.
 * NOTE(review): assumes BYTE_VEC(x) replicates x into all eight bytes -
 * it is defined in asm.h, confirm there.
 */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return (a | b) - half_diff;
}
75 | |
/*
 * Average four packed vectors of eight unsigned bytes, lane by lane,
 * rounding to nearest: (l1 + l2 + l3 + l4 + 2) >> 2 for each lane.
 *
 * Each input is split into its upper six bits and its lower two bits per
 * lane so that neither partial sum can overflow a byte:
 *   - the pre-shifted upper parts add up to at most 4 * 63,
 *   - the lower parts plus the rounding constant add up to at most 14.
 *
 * NOTE(review): assumes BYTE_VEC(x) replicates x into all eight bytes -
 * it is defined in asm.h, confirm there.
 */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    const uint64_t low2 = BYTE_VEC(0x03);
    uint64_t quarters;
    uint64_t rest;

    quarters  = (l1 & ~low2) >> 2;
    quarters += (l2 & ~low2) >> 2;
    quarters += (l3 & ~low2) >> 2;
    quarters += (l4 & ~low2) >> 2;

    rest  = l1 & low2;
    rest += l2 & low2;
    rest += l3 & low2;
    rest += l4 & low2;
    rest += BYTE_VEC(0x02);
    /* The shift drags bits in from the lane above; mask them off again. */
    rest  = (rest >> 2) & low2;

    return quarters + rest;
}
89 | |
/*
 * Accumulate perr() of two 8-pixel-wide blocks over h rows and return the
 * total (presumably the sum of absolute differences, going by the Alpha
 * PERR instruction - confirm against asm.h).
 *
 * v         - unused
 * pix1      - first block, always fetched with ldq()
 * pix2      - second block; may be unaligned
 * line_size - distance in bytes between consecutive rows of both blocks
 * h         - number of rows; the loops test the count only after running
 *             the body, so h must be at least 1
 */
int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;

    if (!((size_t) pix2 & 0x7)) {
        /* pix2 sits on an 8-byte boundary: plain loads on both sides. */
        do {
            const uint64_t cur = ldq(pix1);
            const uint64_t ref = ldq(pix2);

            sad  += perr(cur, ref);
            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        /* Unaligned load path; works only when pix2 is actually
           unaligned.  Handles 8 pixels at a time. */
        do {
            const uint64_t cur = ldq(pix1);
            const uint64_t ref = uldq(pix2);

            sad  += perr(cur, ref);
            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return sad;
}
121 | |
#if 0 /* now done in assembly */
/*
 * Disabled C version of the 16x16 perr() accumulation, kept for reference;
 * the active implementation is written in assembly.
 *
 * pix1      - first block, fetched with ldq()
 * pix2      - second block; may be unaligned
 * line_size - distance in bytes between consecutive rows of both blocks
 */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sad = 0;
    int row;

    if ((size_t) pix2 & 0x7) {
        /* Assemble each unaligned quadword from the two aligned quadwords
           it straddles; works only when pix2 is actually unaligned.
           Handles 16 pixels at a time. */
        for (row = 0; row < 16; row++) {
            const uint64_t cur_l = ldq(pix1);
            const uint64_t cur_r = ldq(pix1 + 8);
            const uint64_t mid   = ldq_u(pix2 + 8);
            const uint64_t ref_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
            const uint64_t ref_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            sad += perr(cur_l, ref_l)
                 + perr(cur_r, ref_r);

            pix1 += line_size;
            pix2 += line_size;
        }
    } else {
        for (row = 0; row < 16; row++) {
            const uint64_t cur_l = ldq(pix1);
            const uint64_t cur_r = ldq(pix1 + 8);
            const uint64_t ref_l = ldq(pix2);
            const uint64_t ref_r = ldq(pix2 + 8);

            sad += perr(cur_l, ref_l)
                 + perr(cur_r, ref_r);

            pix1 += line_size;
            pix2 += line_size;
        }
    }

    return sad;
}
#endif
586 | 164 |
/*
 * Accumulate perr() between a 16-pixel-wide block and a reference block
 * that is first averaged (avg2) with itself shifted one pixel to the
 * right, over h rows.
 *
 * v         - unused
 * pix1      - current block, fetched with ldq()
 * pix2      - reference block; 17 bytes per row are needed and it may be
 *             unaligned
 * line_size - distance in bytes between consecutive rows of both blocks
 * h         - number of rows; the loops test the count only after running
 *             the body, so h must be at least 1
 *
 * Three variants are selected by the low three address bits of pix2.
 * NOTE(review): ldq/ldq_u/extql/extqh/perr come from asm.h (not visible
 * here); comments assume the semantics of the Alpha instructions of the
 * same names - confirm against asm.h.
 */
int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint64_t disalign = (size_t) pix2 & 0x7;
    int sad = 0;

    if (disalign == 0) {
        /* Aligned reference: build the shifted row with plain shifts and
           fetch the 17th pixel as a single byte. */
        do {
            const uint64_t cur_l = ldq(pix1);
            const uint64_t cur_r = ldq(pix1 + 8);
            const uint64_t l     = ldq(pix2);
            const uint64_t r     = ldq(pix2 + 8);
            const uint64_t ref_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
            const uint64_t ref_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));

            sad += perr(cur_l, ref_l)
                 + perr(cur_r, ref_r);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else if (disalign == 7) {
        /* |.......l|lllllllr|rrrrrrr*|
           This case is special because disalign + 1 would be 8, which
           gets treated as 0 by extqh.  The row shifted by one pixel is
           then simply the next aligned quadword, so at least it is a bit
           faster that way :) */
        do {
            const uint64_t cur_l = ldq(pix1);
            const uint64_t cur_r = ldq(pix1 + 8);
            const uint64_t l     = ldq_u(pix2);
            const uint64_t m     = ldq_u(pix2 + 8);
            const uint64_t r     = ldq_u(pix2 + 16);
            const uint64_t ref_l = avg2(extql(l, disalign) | extqh(m, disalign), m);
            const uint64_t ref_r = avg2(extql(m, disalign) | extqh(r, disalign), r);

            sad += perr(cur_l, ref_l)
                 + perr(cur_r, ref_r);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        /* General unaligned case: extract the row at offset disalign and
           again at offset disalign + 1 from the same three quadwords. */
        const uint64_t disalign1 = disalign + 1;

        do {
            const uint64_t cur_l = ldq(pix1);
            const uint64_t cur_r = ldq(pix1 + 8);
            const uint64_t l     = ldq_u(pix2);
            const uint64_t m     = ldq_u(pix2 + 8);
            const uint64_t r     = ldq_u(pix2 + 16);
            const uint64_t ref_l = avg2(extql(l, disalign)  | extqh(m, disalign),
                                        extql(l, disalign1) | extqh(m, disalign1));
            const uint64_t ref_r = avg2(extql(m, disalign)  | extqh(r, disalign),
                                        extql(m, disalign1) | extqh(r, disalign1));

            sad += perr(cur_l, ref_l)
                 + perr(cur_r, ref_r);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return sad;
}
237 | |
/*
 * Accumulate perr() between a 16-pixel-wide block and a reference block
 * whose every row is first averaged (avg2) with the row below it, over
 * h rows.
 *
 * v         - unused
 * pix1      - current block, fetched with ldq(); h rows are read
 * pix2      - reference block; h + 1 rows are read and it may be unaligned
 * line_size - distance in bytes between consecutive rows of both blocks
 * h         - number of rows; the loops test the count only after running
 *             the body, so h must be at least 1
 *
 * Each reference row is loaded once and carried over to the next
 * iteration as the "upper" row.
 */
int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;

    if ((size_t) pix2 & 0x7) {
        /* Unaligned reference: assemble every quadword from the two
           aligned quadwords it straddles. */
        uint64_t mid   = ldq_u(pix2 + 8);
        uint64_t top_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
        uint64_t top_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            const uint64_t cur_l = ldq(pix1);
            const uint64_t cur_r = ldq(pix1 + 8);
            uint64_t bot_l, bot_r;

            pix2 += line_size;
            mid   = ldq_u(pix2 + 8);
            bot_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
            bot_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            sad += perr(cur_l, avg2(top_l, bot_l))
                 + perr(cur_r, avg2(top_r, bot_r));

            pix1 += line_size;
            top_l = bot_l;
            top_r = bot_r;
        } while (--h);
    } else {
        uint64_t top_l = ldq(pix2);
        uint64_t top_r = ldq(pix2 + 8);

        do {
            const uint64_t cur_l = ldq(pix1);
            const uint64_t cur_r = ldq(pix1 + 8);
            uint64_t bot_l, bot_r;

            pix2 += line_size;
            bot_l = ldq(pix2);
            bot_r = ldq(pix2 + 8);

            sad += perr(cur_l, avg2(top_l, bot_l))
                 + perr(cur_r, avg2(top_r, bot_r));

            pix1 += line_size;
            top_l = bot_l;
            top_r = bot_r;
        } while (--h);
    }

    return sad;
}
290 | |
/*
 * Accumulate perr() between a 16-pixel-wide block and a reference block
 * in which every pixel is first replaced by the avg4() of itself, its
 * right neighbour and the two pixels below those, over h rows.
 *
 * v         - unused
 * pix1      - current block, fetched with ldq(); exactly h rows are read
 * pix2      - reference block; h + 1 rows of 17 pixels are needed and it
 *             may be unaligned
 * line_size - distance in bytes between consecutive rows of both blocks
 * h         - number of rows; the loop tests the count only after running
 *             the body, so h must be at least 1
 *
 * Each reference row is loaded once and carried over to the next
 * iteration as the "upper" row.  p2_x / np2_x hold the 17th pixel of a
 * row, already moved to the top byte so it can be OR-ed straight into
 * the right-shifted right half.
 *
 * NOTE(review): ldq/uldq/perr come from asm.h (not visible here); the
 * aligned branch fetches a whole quadword at pix2 + 16 and keeps only the
 * byte that the shift by 56 leaves in place - confirm that this is the
 * lowest-addressed byte for the target's byte order.
 */
int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;
    uint64_t p2_l, p2_r, p2_x;

    /* Upper reference row for the first iteration. */
    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t p1_l, p1_r;
        uint64_t np2_l, np2_r, np2_x;

        /* Load the current row of pix1 here rather than one iteration
           ahead: only h rows of pix1 are ever compared, so prefetching
           the next row would read 16 bytes past the block on the last
           pass. */
        p1_l = ldq(pix1);
        p1_r = ldq(pix1 + 8);

        pix1 += line_size;
        pix2 += line_size;

        /* Lower reference row; the alignment of pix2 is the same on every
           row only if line_size is a multiple of 8, so it is re-checked. */
        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ( p2_r << 56),
                            np2_l, (np2_l >> 8) | (np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) |  p2_x,
                            np2_r, (np2_r >> 8) | np2_x));

        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}