586
|
1 /*
|
|
2 * Alpha optimized DSP utils
|
|
3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
|
|
4 *
|
|
5 * This library is free software; you can redistribute it and/or
|
|
6 * modify it under the terms of the GNU Lesser General Public
|
|
7 * License as published by the Free Software Foundation; either
|
|
8 * version 2 of the License, or (at your option) any later version.
|
|
9 *
|
|
10 * This library is distributed in the hope that it will be useful,
|
|
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
13 * Lesser General Public License for more details.
|
|
14 *
|
|
15 * You should have received a copy of the GNU Lesser General Public
|
|
16 * License along with this library; if not, write to the Free Software
|
|
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
18 */
|
|
19
|
|
20 #include "asm.h"
|
|
21 #include "../dsputil.h"
|
|
22
|
|
23 void get_pixels_mvi(DCTELEM *restrict block,
|
|
24 const uint8_t *restrict pixels, int line_size)
|
|
25 {
|
|
26 int h = 8;
|
|
27
|
|
28 do {
|
|
29 uint64_t p;
|
|
30
|
|
31 p = ldq(pixels);
|
|
32 stq(unpkbw(p), block);
|
|
33 stq(unpkbw(p >> 32), block + 4);
|
|
34
|
|
35 pixels += line_size;
|
|
36 block += 8;
|
|
37 } while (--h);
|
|
38 }
|
|
39
|
|
40 void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
|
|
41 int stride) {
|
|
42 int h = 8;
|
|
43 uint64_t mask = 0x4040;
|
|
44
|
|
45 mask |= mask << 16;
|
|
46 mask |= mask << 32;
|
|
47 do {
|
|
48 uint64_t x, y, c, d, a;
|
|
49 uint64_t signs;
|
|
50
|
|
51 x = ldq(s1);
|
|
52 y = ldq(s2);
|
|
53 c = cmpbge(x, y);
|
|
54 d = x - y;
|
|
55 a = zap(mask, c); /* We use 0x4040404040404040 here... */
|
|
56 d += 4 * a; /* ...so we can use s4addq here. */
|
|
57 signs = zap(-1, c);
|
|
58
|
|
59 stq(unpkbw(d) | (unpkbw(signs) << 8), block);
|
|
60 stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4);
|
|
61
|
|
62 s1 += stride;
|
|
63 s2 += stride;
|
|
64 block += 8;
|
|
65 } while (--h);
|
|
66 }
|
|
67
|
|
68 static inline uint64_t avg2(uint64_t a, uint64_t b)
|
|
69 {
|
|
70 return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
|
|
71 }
|
|
72
|
|
73 static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
|
|
74 {
|
|
75 uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
|
|
76 + ((l2 & ~BYTE_VEC(0x03)) >> 2)
|
|
77 + ((l3 & ~BYTE_VEC(0x03)) >> 2)
|
|
78 + ((l4 & ~BYTE_VEC(0x03)) >> 2);
|
|
79 uint64_t r2 = (( (l1 & BYTE_VEC(0x03))
|
|
80 + (l2 & BYTE_VEC(0x03))
|
|
81 + (l3 & BYTE_VEC(0x03))
|
|
82 + (l4 & BYTE_VEC(0x03))
|
|
83 + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
|
|
84 return r1 + r2;
|
|
85 }
|
|
86
|
|
87 int pix_abs8x8_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
|
|
88 {
|
|
89 int result = 0;
|
|
90 int h = 8;
|
|
91
|
|
92 if ((size_t) pix2 & 0x7) {
|
|
93 /* works only when pix2 is actually unaligned */
|
|
94 do { /* do 8 pixel a time */
|
|
95 uint64_t p1, p2;
|
|
96
|
|
97 p1 = ldq(pix1);
|
|
98 p2 = uldq(pix2);
|
|
99 result += perr(p1, p2);
|
|
100
|
|
101 pix1 += line_size;
|
|
102 pix2 += line_size;
|
|
103 } while (--h);
|
|
104 } else {
|
|
105 do {
|
|
106 uint64_t p1, p2;
|
|
107
|
|
108 p1 = ldq(pix1);
|
|
109 p2 = ldq(pix2);
|
|
110 result += perr(p1, p2);
|
|
111
|
|
112 pix1 += line_size;
|
|
113 pix2 += line_size;
|
|
114 } while (--h);
|
|
115 }
|
|
116
|
|
117 return result;
|
|
118 }
|
|
119
|
|
120 int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
|
|
121 {
|
|
122 int result = 0;
|
|
123 int h = 16;
|
|
124
|
|
125 if ((size_t) pix2 & 0x7) {
|
|
126 /* works only when pix2 is actually unaligned */
|
|
127 do { /* do 16 pixel a time */
|
|
128 uint64_t p1_l, p1_r, p2_l, p2_r;
|
|
129 uint64_t t;
|
|
130
|
|
131 p1_l = ldq(pix1);
|
|
132 p1_r = ldq(pix1 + 8);
|
|
133 t = ldq_u(pix2 + 8);
|
|
134 p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
|
|
135 p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
|
|
136 pix1 += line_size;
|
|
137 pix2 += line_size;
|
|
138
|
|
139 result += perr(p1_l, p2_l)
|
|
140 + perr(p1_r, p2_r);
|
|
141 } while (--h);
|
|
142 } else {
|
|
143 do {
|
|
144 uint64_t p1_l, p1_r, p2_l, p2_r;
|
|
145
|
|
146 p1_l = ldq(pix1);
|
|
147 p1_r = ldq(pix1 + 8);
|
|
148 p2_l = ldq(pix2);
|
|
149 p2_r = ldq(pix2 + 8);
|
|
150 pix1 += line_size;
|
|
151 pix2 += line_size;
|
|
152
|
|
153 result += perr(p1_l, p2_l)
|
|
154 + perr(p1_r, p2_r);
|
|
155 } while (--h);
|
|
156 }
|
|
157
|
|
158 return result;
|
|
159 }
|
|
160
|
|
/**
 * Sum of absolute differences of a 16x16 block against pix2
 * interpolated at half-pel horizontally, i.e. each reference pixel
 * is the rounded average of pix2[i] and pix2[i + 1].
 * pix1 must be 8-byte aligned; pix2 may be unaligned.
 */
int pix_abs16x16_x2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;
    uint64_t disalign = (size_t) pix2 & 0x7;

    switch (disalign) {
    case 0:
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l = ldq(pix2);
            r = ldq(pix2 + 8);
            /* Average each quadword with itself shifted one byte
               down; the vacated top byte is refilled from the next
               quadword (or from the 17th pixel for the right half). */
            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    case 7:
        /* |.......l|lllllllr|rrrrrrr*|
           This case is special because disalign1 would be 8, which
           gets treated as 0 by extqh. At least it is a bit faster
           that way :) */
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l = ldq_u(pix2);
            m = ldq_u(pix2 + 8);
            r = ldq_u(pix2 + 16);
            /* With disalign == 7 the data shifted by one extra byte
               is exactly the next aligned quadword, so the second
               operand of avg2 needs no extract at all. */
            p2_l = avg2(extql(l, disalign) | extqh(m, disalign), m);
            p2_r = avg2(extql(m, disalign) | extqh(r, disalign), r);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    default:
        do {
            uint64_t disalign1 = disalign + 1;
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l = ldq_u(pix2);
            m = ldq_u(pix2 + 8);
            r = ldq_u(pix2 + 16);
            /* Extract the row at byte offsets disalign and
               disalign + 1 from the three aligned quadwords and
               average the two to get the half-pel row. */
            p2_l = avg2(extql(l, disalign) | extqh(m, disalign),
                        extql(l, disalign1) | extqh(m, disalign1));
            p2_r = avg2(extql(m, disalign) | extqh(r, disalign),
                        extql(m, disalign1) | extqh(r, disalign1));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    }
    return result;
}
|
|
234
|
|
/**
 * Sum of absolute differences of a 16x16 block against pix2
 * interpolated at half-pel vertically, i.e. each reference pixel is
 * the rounded average of a pixel and the one directly below it.
 * pix1 must be 8-byte aligned; pix2 may be unaligned.
 * Each pix2 row is loaded only once: the previous iteration's "next
 * row" is carried over in p2_l / p2_r.
 */
int pix_abs16x16_y2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        /* Unaligned pix2: assemble each 8-byte half from two ldq_u
           loads with extql/extqh; the middle quadword t is shared
           between the two halves. */
        uint64_t t, p2_l, p2_r;
        t = ldq_u(pix2 + 8);
        p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
        p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;
            uint64_t t;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            /* Load the following pix2 row the same way... */
            t = ldq_u(pix2 + 8);
            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            /* ...and compare pix1 against the average of the current
               and following rows. */
            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l = np2_l;
            p2_r = np2_r;

        } while (--h);
    } else {
        /* Aligned pix2: plain quadword loads. */
        uint64_t p2_l, p2_r;
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            /* Reuse this row as the "current" row next iteration. */
            p2_l = np2_l;
            p2_r = np2_r;
        } while (--h);
    }
    return result;
}
|
|
288
|
|
/**
 * Sum of absolute differences of a 16x16 block against pix2
 * interpolated at half-pel both horizontally and vertically, i.e.
 * each reference pixel is the rounded average of four neighbouring
 * pixels.  pix1 must be 8-byte aligned; pix2 may be unaligned.
 * Each pix2 row is loaded only once and carried between iterations
 * in p2_l / p2_r / p2_x.
 */
int pix_abs16x16_xy2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    uint64_t p1_l, p1_r;
    /* p2_x carries the 17th pixel of the row in its top byte, ready
       to be merged into the shifted right half below. */
    uint64_t p2_l, p2_r, p2_x;

    p1_l = ldq(pix1);
    p1_r = ldq(pix1 + 8);

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        /* The left shift keeps only the low byte, i.e. pixel 16. */
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t np1_l, np1_r;
        uint64_t np2_l, np2_r, np2_x;

        pix1 += line_size;
        pix2 += line_size;

        np1_l = ldq(pix1);
        np1_r = ldq(pix1 + 8);

        /* Re-checked every row: if line_size is not a multiple of 8
           the alignment of pix2 changes from row to row. */
        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        /* Four-tap average of (current row, current row shifted one
           pixel left, next row, next row shifted one pixel left);
           the shifted-out top byte is refilled from the neighbouring
           quadword (or from p2_x / np2_x for the right half). */
        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t) p2_r << 56),
                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t) p2_x),
                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));

        /* Shift the pipeline: next row becomes the current row. */
        p1_l = np1_l;
        p1_r = np1_r;
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}
|