Mercurial > libavcodec.hg
annotate alpha/motion_est_alpha.c @ 1339:338a2f6e6402 libavcodec
Mpeg2 16x8 Patch by ("Ivan Kalvachev" <ivan at cacad dot com>)
author | michaelni |
---|---|
date | Thu, 03 Jul 2003 23:29:00 +0000 |
parents | 107a56aa74f5 |
children | dea5b2946999 |
rev | line source |
---|---|
586 | 1 /* |
2 * Alpha optimized DSP utils | |
3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org> | |
4 * | |
5 * This library is free software; you can redistribute it and/or | |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
9 * | |
10 * This library is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 * Lesser General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU Lesser General Public | |
16 * License along with this library; if not, write to the Free Software | |
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
18 */ | |
19 | |
20 #include "asm.h" | |
21 #include "../dsputil.h" | |
22 | |
23 void get_pixels_mvi(DCTELEM *restrict block, | |
24 const uint8_t *restrict pixels, int line_size) | |
25 { | |
26 int h = 8; | |
27 | |
28 do { | |
29 uint64_t p; | |
30 | |
31 p = ldq(pixels); | |
32 stq(unpkbw(p), block); | |
33 stq(unpkbw(p >> 32), block + 4); | |
34 | |
35 pixels += line_size; | |
36 block += 8; | |
37 } while (--h); | |
38 } | |
39 | |
40 void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, | |
41 int stride) { | |
42 int h = 8; | |
43 uint64_t mask = 0x4040; | |
44 | |
45 mask |= mask << 16; | |
46 mask |= mask << 32; | |
47 do { | |
48 uint64_t x, y, c, d, a; | |
49 uint64_t signs; | |
50 | |
51 x = ldq(s1); | |
52 y = ldq(s2); | |
53 c = cmpbge(x, y); | |
54 d = x - y; | |
55 a = zap(mask, c); /* We use 0x4040404040404040 here... */ | |
56 d += 4 * a; /* ...so we can use s4addq here. */ | |
57 signs = zap(-1, c); | |
58 | |
59 stq(unpkbw(d) | (unpkbw(signs) << 8), block); | |
60 stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4); | |
61 | |
62 s1 += stride; | |
63 s2 += stride; | |
64 block += 8; | |
65 } while (--h); | |
66 } | |
67 | |
/*
 * Byte-wise average of two packed 8-byte vectors, rounding up:
 * each result byte is (a_i + b_i + 1) >> 1.
 *
 * Uses the identity (a + b + 1) >> 1 == (a | b) - ((a ^ b) >> 1).
 * The low bit of every byte of a ^ b is masked off before the shift so
 * that nothing leaks into the neighbouring byte.
 */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    const uint64_t either    = a | b;
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return either - half_diff;
}
72 | |
/*
 * Byte-wise rounded average of four packed 8-byte vectors:
 * each result byte is (l1_i + l2_i + l3_i + l4_i + 2) >> 2.
 *
 * The upper six and the lower two bits of every byte are handled
 * separately so that no intermediate sum overflows its byte lane.
 */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    const uint64_t low2 = BYTE_VEC(0x03);
    uint64_t quarters;
    uint64_t rest;

    /* Sum of the four values with their low two bits dropped, each
       already divided by 4 (at most 4 * 63 per byte). */
    quarters  = (l1 & ~low2) >> 2;
    quarters += (l2 & ~low2) >> 2;
    quarters += (l3 & ~low2) >> 2;
    quarters += (l4 & ~low2) >> 2;

    /* Sum of the low two bits plus the rounding constant (at most 14 per
       byte), divided by 4; the final mask removes bits shifted in from
       the neighbouring byte. */
    rest  = l1 & low2;
    rest += l2 & low2;
    rest += l3 & low2;
    rest += l4 & low2;
    rest += BYTE_VEC(0x02);
    rest  = (rest >> 2) & low2;

    return quarters + rest;
}
86 | |
/*
 * Accumulate perr() over the 8 rows of two 8-pixel-wide blocks.
 * NOTE(review): perr() comes from asm.h; it is presumably the MVI
 * byte-wise sum of absolute differences -- confirm there.
 *
 * pix1:      first block, always read with ldq().
 * pix2:      second block; its alignment is tested once on entry and
 *            selects uldq() or ldq() for all rows.
 * line_size: byte distance between consecutive rows of both blocks.
 *
 * Returns the sum of the eight perr() results.
 */
int pix_abs8x8_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    const int pix2_unaligned = ((size_t) pix2 & 0x7) != 0;
    int sum = 0;
    int row;

    if (pix2_unaligned) {
        /* works only when pix2 is actually unaligned */
        for (row = 0; row < 8; row++) {     /* 8 pixels at a time */
            sum += perr(ldq(pix1), uldq(pix2));

            pix1 += line_size;
            pix2 += line_size;
        }
    } else {
        for (row = 0; row < 8; row++) {
            sum += perr(ldq(pix1), ldq(pix2));

            pix1 += line_size;
            pix2 += line_size;
        }
    }

    return sum;
}
119 | |
705
107a56aa74f5
Add Alpha assembly for pix_abs16x16. Optimized for pca56, no large win
mellum
parents:
586
diff
changeset
|
120 #if 0 /* now done in assembly */ |
/*
 * C version of the 16x16 perr() accumulation; compiled out by the
 * surrounding #if 0 because an assembly version is used instead.
 *
 * pix1:      first block, read with ldq() at offsets 0 and 8.
 * pix2:      second block; its alignment is tested once on entry.
 * line_size: byte distance between consecutive rows of both blocks.
 *
 * Returns the sum of perr() over both 8-byte halves of all 16 rows.
 */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        for (row = 0; row < 16; row++) {    /* 16 pixels at a time */
            /* Three ldq_u() loads around pix2; the two 8-byte halves
               are then assembled with extql/extqh, which take the
               pointer itself as their shift argument. */
            const uint64_t q0 = ldq_u(pix2);
            const uint64_t q1 = ldq_u(pix2 + 8);
            const uint64_t q2 = ldq_u(pix2 + 16);
            const uint64_t ref_l = extql(q0, pix2) | extqh(q1, pix2);
            const uint64_t ref_r = extql(q1, pix2) | extqh(q2, pix2);

            sum += perr(ldq(pix1), ref_l)
                 + perr(ldq(pix1 + 8), ref_r);

            pix1 += line_size;
            pix2 += line_size;
        }
    } else {
        for (row = 0; row < 16; row++) {
            sum += perr(ldq(pix1), ldq(pix2))
                 + perr(ldq(pix1 + 8), ldq(pix2 + 8));

            pix1 += line_size;
            pix2 += line_size;
        }
    }

    return sum;
}
705
107a56aa74f5
Add Alpha assembly for pix_abs16x16. Optimized for pca56, no large win
mellum
parents:
586
diff
changeset
|
161 #endif |
586 | 162 |
/*
 * 16x16 perr() accumulation against a horizontally interpolated
 * reference: every byte of pix1 is compared with avg2() of pix2[x] and
 * pix2[x + 1], so 17 bytes of each pix2 row are used.
 *
 * pix1:      first block, read with ldq() at offsets 0 and 8.
 * pix2:      reference block; its alignment is tested once on entry and
 *            selects one of three loops.
 * line_size: byte distance between consecutive rows of both blocks.
 *
 * Returns the sum of perr() over both 8-byte halves of all 16 rows.
 */
int pix_abs16x16_x2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    const uint64_t disalign = (size_t) pix2 & 0x7;
    int sum = 0;
    int row;

    if (disalign == 0) {
        /* Aligned: build the one-byte-shifted vectors with plain shifts
           and fetch the 17th byte separately. */
        for (row = 0; row < 16; row++) {
            const uint64_t cur_l  = ldq(pix2);
            const uint64_t cur_r  = ldq(pix2 + 8);
            const uint64_t next_l = (cur_l >> 8) | (cur_r << 56);
            const uint64_t next_r = (cur_r >> 8)
                                  | ((uint64_t) pix2[16] << 56);

            sum += perr(ldq(pix1), avg2(cur_l, next_l))
                 + perr(ldq(pix1 + 8), avg2(cur_r, next_r));

            pix1 += line_size;
            pix2 += line_size;
        }
    } else if (disalign == 7) {
        /* |.......l|lllllllr|rrrrrrr*|
           Handled separately because disalign + 1 would be 8, which
           extqh treats as 0.  Here the vectors shifted by one byte are
           exactly the second and third quadwords loaded, which is also a
           bit faster. */
        for (row = 0; row < 16; row++) {
            const uint64_t q0 = ldq_u(pix2);
            const uint64_t q1 = ldq_u(pix2 + 8);
            const uint64_t q2 = ldq_u(pix2 + 16);
            const uint64_t cur_l = extql(q0, disalign) | extqh(q1, disalign);
            const uint64_t cur_r = extql(q1, disalign) | extqh(q2, disalign);

            sum += perr(ldq(pix1), avg2(cur_l, q1))
                 + perr(ldq(pix1 + 8), avg2(cur_r, q2));

            pix1 += line_size;
            pix2 += line_size;
        }
    } else {
        /* disalign in 1..6: extract both the vector at pix2 and the one
           at pix2 + 1 from the same three quadwords. */
        const uint64_t disalign1 = disalign + 1;

        for (row = 0; row < 16; row++) {
            const uint64_t q0 = ldq_u(pix2);
            const uint64_t q1 = ldq_u(pix2 + 8);
            const uint64_t q2 = ldq_u(pix2 + 16);
            const uint64_t cur_l  = extql(q0, disalign)
                                  | extqh(q1, disalign);
            const uint64_t next_l = extql(q0, disalign1)
                                  | extqh(q1, disalign1);
            const uint64_t cur_r  = extql(q1, disalign)
                                  | extqh(q2, disalign);
            const uint64_t next_r = extql(q1, disalign1)
                                  | extqh(q2, disalign1);

            sum += perr(ldq(pix1), avg2(cur_l, next_l))
                 + perr(ldq(pix1 + 8), avg2(cur_r, next_r));

            pix1 += line_size;
            pix2 += line_size;
        }
    }

    return sum;
}
236 | |
/*
 * 16x16 perr() accumulation against a vertically interpolated
 * reference: every row of pix1 is compared with avg2() of the pix2 row
 * at the same position and the pix2 row below it, so 17 rows of pix2
 * are read.
 *
 * pix1:      first block, read with ldq() at offsets 0 and 8.
 * pix2:      reference block; its alignment is tested once on entry.
 * line_size: byte distance between consecutive rows of both blocks.
 *
 * Returns the sum of perr() over both 8-byte halves of all 16 rows.
 */
int pix_abs16x16_y2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint64_t above_l, above_r;  /* pix2 row carried over from last pass */
    int sum = 0;
    int row;

    if ((size_t) pix2 & 0x7) {
        uint64_t mid = ldq_u(pix2 + 8);

        /* Row 0 of pix2, assembled from three ldq_u() loads. */
        above_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
        above_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        for (row = 0; row < 16; row++) {
            uint64_t below_l, below_r;

            pix2 += line_size;
            mid = ldq_u(pix2 + 8);
            below_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
            below_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            sum += perr(ldq(pix1), avg2(above_l, below_l))
                 + perr(ldq(pix1 + 8), avg2(above_r, below_r));

            pix1 += line_size;
            above_l = below_l;
            above_r = below_r;
        }
    } else {
        above_l = ldq(pix2);
        above_r = ldq(pix2 + 8);

        for (row = 0; row < 16; row++) {
            uint64_t below_l, below_r;

            pix2 += line_size;
            below_l = ldq(pix2);
            below_r = ldq(pix2 + 8);

            sum += perr(ldq(pix1), avg2(above_l, below_l))
                 + perr(ldq(pix1 + 8), avg2(above_r, below_r));

            pix1 += line_size;
            above_l = below_l;
            above_r = below_r;
        }
    }

    return sum;
}
290 | |
/*
 * 16x16 perr() accumulation against a reference interpolated in both
 * directions: every byte of pix1 is compared with avg4() of the four
 * pix2 bytes at (x, y), (x + 1, y), (x, y + 1) and (x + 1, y + 1).
 * 17 bytes of 17 rows of pix2 are used.
 *
 * pix1:      first block, read with ldq() at offsets 0 and 8.
 * pix2:      reference block; its alignment is re-tested for every row.
 * line_size: byte distance between consecutive rows of both blocks.
 *
 * Returns the sum of perr() over both 8-byte halves of all 16 rows.
 *
 * Exactly 16 rows of pix1 are read.  Each pix1 row is loaded at the top
 * of the iteration that consumes it; prefetching the next row at the end
 * of an iteration would touch a 17th row that is never used and may lie
 * outside the caller's buffer.
 */
int pix_abs16x16_xy2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    /* Current pix2 row: left half, right half, and the 17th byte already
       shifted into the top byte position. */
    uint64_t p2_l, p2_r, p2_x;

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t p1_l, p1_r;
        uint64_t np2_l, np2_r, np2_x;

        /* Row of pix1 belonging to this iteration. */
        p1_l = ldq(pix1);
        p1_r = ldq(pix1 + 8);

        pix1 += line_size;
        pix2 += line_size;

        /* Next row of pix2 (the lower neighbours). */
        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        /* (v >> 8) | (next << 56) is the same row shifted left by one
           pixel, with the first byte of the following quadword moved
           into the vacated top byte. */
        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ( p2_r << 56),
                            np2_l, (np2_l >> 8) | (np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) |  p2_x,
                            np2_r, (np2_r >> 8) | np2_x));

        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}