Mercurial > libavcodec.hg
annotate alpha/motion_est_alpha.c @ 12518:67e7e49058c2 libavcodec
Split and then simplify address generation macro.
Allows nasm to work for this code.
author | reimar |
---|---|
date | Sun, 26 Sep 2010 09:08:11 +0000 |
parents | 9e7d38743146 |
children |
rev | line source |
---|---|
586 | 1 /* |
2 * Alpha optimized DSP utils | |
3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org> | |
4 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
5 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
6 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
7 * FFmpeg is free software; you can redistribute it and/or |
586 | 8 * modify it under the terms of the GNU Lesser General Public |
9 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
10 * version 2.1 of the License, or (at your option) any later version. |
586 | 11 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
12 * FFmpeg is distributed in the hope that it will be useful, |
586 | 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
18 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
586 | 20 */ |
21 | |
6763 | 22 #include "libavcodec/dsputil.h" |
11396 | 23 #include "dsputil_alpha.h" |
586 | 24 #include "asm.h" |
25 | |
26 void get_pixels_mvi(DCTELEM *restrict block, | |
27 const uint8_t *restrict pixels, int line_size) | |
28 { | |
29 int h = 8; | |
30 | |
31 do { | |
32 uint64_t p; | |
33 | |
34 p = ldq(pixels); | |
35 stq(unpkbw(p), block); | |
2967 | 36 stq(unpkbw(p >> 32), block + 4); |
586 | 37 |
38 pixels += line_size; | |
39 block += 8; | |
40 } while (--h); | |
41 } | |
42 | |
43 void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, | |
44 int stride) { | |
45 int h = 8; | |
46 uint64_t mask = 0x4040; | |
47 | |
48 mask |= mask << 16; | |
49 mask |= mask << 32; | |
50 do { | |
51 uint64_t x, y, c, d, a; | |
52 uint64_t signs; | |
53 | |
54 x = ldq(s1); | |
55 y = ldq(s2); | |
56 c = cmpbge(x, y); | |
57 d = x - y; | |
58 a = zap(mask, c); /* We use 0x4040404040404040 here... */ | |
59 d += 4 * a; /* ...so we can use s4addq here. */ | |
60 signs = zap(-1, c); | |
61 | |
62 stq(unpkbw(d) | (unpkbw(signs) << 8), block); | |
63 stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4); | |
64 | |
65 s1 += stride; | |
66 s2 += stride; | |
67 block += 8; | |
68 } while (--h); | |
69 } | |
70 | |
/**
 * Average two vectors of eight unsigned bytes, rounding each byte up.
 *
 * Uses the identity (a + b + 1) >> 1 == (a | b) - ((a ^ b) >> 1). The
 * 0xfe mask drops the low bit of every byte before the shift so that it
 * cannot leak into the byte below.
 */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    const uint64_t either   = a | b;
    const uint64_t differ   = a ^ b;
    const uint64_t half_gap = (differ & BYTE_VEC(0xfe)) >> 1;

    return either - half_gap;
}
75 | |
/**
 * Average four vectors of eight unsigned bytes: (l1+l2+l3+l4+2) >> 2 per byte.
 *
 * The upper six bits and the lower two bits of every byte are summed
 * separately so that no partial sum can overflow its byte: the upper parts
 * are pre-divided by four, the lower parts (at most 4*3 + 2) are rounded
 * and divided afterwards.
 */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    const uint64_t low2  = BYTE_VEC(0x03);
    const uint64_t high6 = ~BYTE_VEC(0x03);

    const uint64_t high_part = ((l1 & high6) >> 2)
                             + ((l2 & high6) >> 2)
                             + ((l3 & high6) >> 2)
                             + ((l4 & high6) >> 2);
    const uint64_t low_sum   = (l1 & low2)
                             + (l2 & low2)
                             + (l3 & low2)
                             + (l4 & low2)
                             + BYTE_VEC(0x02);      /* rounding term */
    const uint64_t low_part  = (low_sum >> 2) & low2;

    return high_part + low_part;
}
89 | |
/**
 * Accumulate perr() of pix1 against pix2 over an 8-pixel-wide, h-row block
 * (presumably the sum of absolute differences, going by the function name).
 *
 * @param v         unused
 * @param pix1      first block; always read with ldq
 * @param pix2      second block; read with uldq when not 8-byte aligned
 * @param line_size byte distance between rows, used for both blocks
 * @param h         number of rows; must be at least 1 (do/while loop)
 * @return          sum of the per-row perr() values
 */
int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;

    if (((size_t) pix2 & 0x7) == 0) {
        /* Both blocks aligned: plain quadword loads. */
        do {
            sad += perr(ldq(pix1), ldq(pix2));

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
        return sad;
    }

    /* This path is only valid when pix2 really is unaligned. */
    do {    /* eight pixels per iteration */
        sad += perr(ldq(pix1), uldq(pix2));

        pix1 += line_size;
        pix2 += line_size;
    } while (--h);

    return sad;
}
121 | |
#if 0 /* now done in assembly */
/*
 * Disabled C version of the 16x16 pixel comparison, kept for reference.
 * Sums perr() over 16 rows of 16 pixels. pix1 is read with aligned loads;
 * pix2 may be unaligned, in which case each row is assembled from three
 * ldq_u loads with extql/extqh.
 */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sad = 0;
    int rows_left = 16;

    if ((size_t) pix2 & 0x7) {
        /* This path is only valid when pix2 really is unaligned. */
        do {    /* sixteen pixels per iteration */
            const uint64_t cur_l = ldq(pix1);
            const uint64_t cur_r = ldq(pix1 + 8);
            const uint64_t mid   = ldq_u(pix2 + 8);
            const uint64_t ref_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
            const uint64_t ref_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            pix1 += line_size;
            pix2 += line_size;

            sad += perr(cur_l, ref_l)
                 + perr(cur_r, ref_r);
        } while (--rows_left);
    } else {
        do {
            const uint64_t cur_l = ldq(pix1);
            const uint64_t cur_r = ldq(pix1 + 8);
            const uint64_t ref_l = ldq(pix2);
            const uint64_t ref_r = ldq(pix2 + 8);

            pix1 += line_size;
            pix2 += line_size;

            sad += perr(cur_l, ref_l)
                 + perr(cur_r, ref_r);
        } while (--rows_left);
    }

    return sad;
}
#endif
586 | 164 |
/**
 * Compare a 16-pixel-wide block with a horizontally interpolated reference.
 *
 * Every pix2 pixel is averaged (avg2, rounding up) with its right-hand
 * neighbour before perr() against pix1, so 17 bytes of each pix2 row are
 * read. The alignment of pix2 is sampled once, before the loop, and
 * selects one of three loops; this presumes line_size keeps that alignment
 * from row to row.
 *
 * @param v         unused
 * @param pix1      block to compare; read with aligned ldq
 * @param pix2      reference block; any alignment
 * @param line_size byte distance between rows, used for both blocks
 * @param h         number of rows; must be at least 1 (do/while loops)
 * @return          sum of the per-row perr() values
 */
int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint64_t disalign = (size_t) pix2 & 0x7;
    int sad = 0;

    if (disalign == 0) {
        /* Aligned reference: the neighbour vector is the row shifted down
           by one byte, with the next pixel moved into the top byte. */
        do {
            const uint64_t cur_l  = ldq(pix1);
            const uint64_t cur_r  = ldq(pix1 + 8);
            const uint64_t ref_l  = ldq(pix2);
            const uint64_t ref_r  = ldq(pix2 + 8);
            const uint64_t next_l = (ref_l >> 8) | (ref_r << 56);
            const uint64_t next_r = (ref_r >> 8) | ((uint64_t) pix2[16] << 56);

            sad += perr(cur_l, avg2(ref_l, next_l))
                 + perr(cur_r, avg2(ref_r, next_r));

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else if (disalign == 7) {
        /* |.......l|lllllllr|rrrrrrr*|
           Special case: the neighbour vector starts at offset 8, which
           extqh would treat as 0. It is simply the following aligned
           quadword, which also makes this path a little cheaper. */
        do {
            const uint64_t cur_l = ldq(pix1);
            const uint64_t cur_r = ldq(pix1 + 8);
            const uint64_t q0    = ldq_u(pix2);
            const uint64_t q1    = ldq_u(pix2 + 8);
            const uint64_t q2    = ldq_u(pix2 + 16);
            const uint64_t ref_l = extql(q0, disalign) | extqh(q1, disalign);
            const uint64_t ref_r = extql(q1, disalign) | extqh(q2, disalign);

            sad += perr(cur_l, avg2(ref_l, q1))
                 + perr(cur_r, avg2(ref_r, q2));

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        /* Offsets 1..6: extract the row at disalign and its neighbour
           vector at disalign + 1 from the same three quadwords. */
        const uint64_t disalign1 = disalign + 1;

        do {
            const uint64_t cur_l  = ldq(pix1);
            const uint64_t cur_r  = ldq(pix1 + 8);
            const uint64_t q0     = ldq_u(pix2);
            const uint64_t q1     = ldq_u(pix2 + 8);
            const uint64_t q2     = ldq_u(pix2 + 16);
            const uint64_t ref_l  = extql(q0, disalign)  | extqh(q1, disalign);
            const uint64_t next_l = extql(q0, disalign1) | extqh(q1, disalign1);
            const uint64_t ref_r  = extql(q1, disalign)  | extqh(q2, disalign);
            const uint64_t next_r = extql(q1, disalign1) | extqh(q2, disalign1);

            sad += perr(cur_l, avg2(ref_l, next_l))
                 + perr(cur_r, avg2(ref_r, next_r));

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return sad;
}
237 | |
/**
 * Compare a 16-pixel-wide block with a vertically interpolated reference.
 *
 * Every pix2 row is averaged (avg2, rounding up) with the row below it
 * before perr() against pix1, so h + 1 rows of pix2 are read. The row
 * below is carried over to the next iteration instead of being reloaded.
 * The alignment of pix2 is checked once, before the loop.
 *
 * @param v         unused
 * @param pix1      block to compare; read with aligned ldq
 * @param pix2      reference block; any alignment
 * @param line_size byte distance between rows, used for both blocks
 * @param h         number of rows; must be at least 1 (do/while loops)
 * @return          sum of the per-row perr() values
 */
int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    uint64_t above_l, above_r;

    if (((size_t) pix2 & 0x7) == 0) {
        /* Aligned reference: plain quadword loads. */
        above_l = ldq(pix2);
        above_r = ldq(pix2 + 8);

        do {
            const uint64_t cur_l = ldq(pix1);
            const uint64_t cur_r = ldq(pix1 + 8);
            uint64_t below_l, below_r;

            pix2   += line_size;
            below_l = ldq(pix2);
            below_r = ldq(pix2 + 8);

            sad += perr(cur_l, avg2(above_l, below_l))
                 + perr(cur_r, avg2(above_r, below_r));

            pix1   += line_size;
            above_l = below_l;
            above_r = below_r;
        } while (--h);

        return sad;
    }

    /* Unaligned reference: build each row from three ldq_u loads. */
    {
        uint64_t mid = ldq_u(pix2 + 8);

        above_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
        above_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            const uint64_t cur_l = ldq(pix1);
            const uint64_t cur_r = ldq(pix1 + 8);
            uint64_t below_l, below_r;

            pix2   += line_size;
            mid     = ldq_u(pix2 + 8);
            below_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
            below_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            sad += perr(cur_l, avg2(above_l, below_l))
                 + perr(cur_r, avg2(above_r, below_r));

            pix1   += line_size;
            above_l = below_l;
            above_r = below_r;
        } while (--h);
    }

    return sad;
}
290 | |
/**
 * Compare a 16-pixel-wide block with a reference interpolated both
 * horizontally and vertically.
 *
 * Every pix2 pixel is averaged (avg4) with its right, lower and lower-right
 * neighbours before perr() against pix1, so h + 1 rows of 17 pixels are
 * read from pix2. The lower row is carried over to the next iteration
 * instead of being reloaded. The alignment of pix2 is re-checked for every
 * row.
 *
 * Exactly h rows of pix1 are read: each row is loaded in the iteration
 * that consumes it, so nothing past the compared block is touched.
 *
 * @param v         unused
 * @param pix1      block to compare; read with aligned ldq
 * @param pix2      reference block; any alignment
 * @param line_size byte distance between rows, used for both blocks
 * @param h         number of rows; must be at least 1 (do/while loop)
 * @return          sum of the per-row perr() values
 */
int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    /* Upper reference row: two quadwords plus the 17th pixel, already
       shifted into the top byte so it can be OR-ed into the neighbour
       vector. */
    uint64_t p2_l, p2_r, p2_x;

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t p1_l, p1_r;
        uint64_t np2_l, np2_r, np2_x;

        /* Load the pix1 row here rather than one iteration ahead, so the
           final iteration does not read the row below the block. */
        p1_l = ldq(pix1);
        p1_r = ldq(pix1 + 8);

        pix1 += line_size;
        pix2 += line_size;

        /* Lower reference row. */
        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ( p2_r << 56),
                            np2_l, (np2_l >> 8) | (np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) |  p2_x,
                            np2_r, (np2_r >> 8) | np2_x));

        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}