Mercurial > libavcodec.hg
annotate sparc/dsputil_vis.c @ 12043:f9a0bd0888a4 libavcodec
mpegaudio: call ff_mpegaudiodec_init_mmx() only from float decoder
The mmx code is floating-point only, and this function does not know
from which decoder it is called. Without this change, the integer
decoder only "works" because the size of the context struct is smaller
in this case, and the mmx init function writes the function pointer
outside the allocated context.
author | mru |
---|---|
date | Thu, 01 Jul 2010 23:21:17 +0000 |
parents | 7be32921237f |
children |
rev | line source |
---|---|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1 /* |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2 * Copyright (C) 2003 David S. Miller <davem@redhat.com> |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
4 * This file is part of FFmpeg. |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
5 * |
3987
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
6 * FFmpeg is free software; you can redistribute it and/or |
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
7 * modify it under the terms of the GNU Lesser General Public |
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
8 * License as published by the Free Software Foundation; either |
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
9 * version 2.1 of the License, or (at your option) any later version. |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
10 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
11 * FFmpeg is distributed in the hope that it will be useful, |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
3987
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
14 * Lesser General Public License for more details. |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
15 * |
3987
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
16 * You should have received a copy of the GNU Lesser General Public |
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
17 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
19 */ |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
20 |
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
21 /* The *no_round* functions have been added by James A. Morrison, 2003,2004. |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
22 The vis code from libmpeg2 was adapted for ffmpeg by James A. Morrison. |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
23 */ |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
24 |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
25 #include "config.h" |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
26 |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
27 #include <inttypes.h> |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
28 |
6763 | 29 #include "libavcodec/dsputil.h" |
11457 | 30 #include "dsputil_vis.h" |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
31 |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
32 #include "vis.h" |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
33 |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
34 /* The trick used in some of this file is the formula from the MMX |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
35 * motion comp code, which is: |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
36 * |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
37 * (x+y+1)>>1 == (x|y)-((x^y)>>1) |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
38 * |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
39 * This allows us to average 8 bytes at a time in a 64-bit FPU reg. |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
40 * We avoid overflows by masking before we do the shift, and we |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
41 * implement the shift by multiplying by 1/2 using mul8x16. So in |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
42 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
43 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
44 * the value 0x80808080 is in f8): |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
45 * |
2979 | 46 * fxor f0, f2, f10 |
47 * fand f10, f4, f10 | |
48 * fmul8x16 f8, f10, f10 | |
49 * fand f10, f6, f10 | |
50 * for f0, f2, f12 | |
51 * fpsub16 f12, f10, f10 | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
52 */ |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
53 |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
54 #define DUP4(x) {x, x, x, x} |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
55 #define DUP8(x) {x, x, x, x, x, x, x, x} |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11458
diff
changeset
|
56 DECLARE_ALIGNED(8, static const int16_t, constants1)[] = DUP4 (1); |
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11458
diff
changeset
|
57 DECLARE_ALIGNED(8, static const int16_t, constants2)[] = DUP4 (2); |
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11458
diff
changeset
|
58 DECLARE_ALIGNED(8, static const int16_t, constants3)[] = DUP4 (3); |
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11458
diff
changeset
|
59 DECLARE_ALIGNED(8, static const int16_t, constants6)[] = DUP4 (6); |
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11458
diff
changeset
|
60 DECLARE_ALIGNED(8, static const int8_t, constants_fe)[] = DUP8 (0xfe); |
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11458
diff
changeset
|
61 DECLARE_ALIGNED(8, static const int8_t, constants_7f)[] = DUP8 (0x7f); |
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11458
diff
changeset
|
62 DECLARE_ALIGNED(8, static const int8_t, constants128)[] = DUP8 (128); |
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11458
diff
changeset
|
63 DECLARE_ALIGNED(8, static const int16_t, constants256_512)[] = |
2979 | 64 {256, 512, 256, 512}; |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11458
diff
changeset
|
65 DECLARE_ALIGNED(8, static const int16_t, constants256_1024)[] = |
2979 | 66 {256, 1024, 256, 1024}; |
67 | |
/*
 * Numeric operands passed to the vis_* macros throughout this file.
 *
 * NOTE(review): these look like SPARC floating-point register numbers,
 * with an even number naming a 64-bit register and the odd "_1" name its
 * second 32-bit half -- confirm against vis.h.
 *
 * Several names share one number (CONST_1/2/3/6 and MASK_fe are all 20,
 * CONST_128/256/512/1024 are all 22, ZERO and MASK_7f are both 30), so
 * the names within one group alias each other.
 */

/* Reference (source) data. */
#define REF_0           0
#define REF_0_1         1
#define REF_2           2
#define REF_2_1         3
#define REF_4           4
#define REF_4_1         5
#define REF_6           6
#define REF_6_1         7
#define REF_S0          8
#define REF_S0_1        9
#define REF_S2          10
#define REF_S2_1        11
#define REF_S4          12
#define REF_S4_1        13
#define REF_S6          14
#define REF_S6_1        15

/* Destination data. */
#define DST_0           16
#define DST_1           17
#define DST_2           18
#define DST_3           19

/* Constants and masks (aliased, see above). */
#define CONST_1         20
#define CONST_2         20
#define CONST_3         20
#define CONST_6         20
#define MASK_fe         20
#define CONST_128       22
#define CONST_256       22
#define CONST_512       22
#define CONST_1024      22

/* Scratch values. */
#define TMP0            24
#define TMP1            25
#define TMP2            26
#define TMP3            27
#define TMP4            28
#define TMP5            29
#define ZERO            30
#define MASK_7f         30

#define TMP6            32
#define TMP8            34
#define TMP10           36
#define TMP12           38
#define TMP14           40
#define TMP16           42
#define TMP18           44
#define TMP20           46
#define TMP22           48
#define TMP24           50
#define TMP26           52
#define TMP28           54
#define TMP30           56
#define TMP32           58
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
120 |
11458 | 121 static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 122 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
123 { |
2979 | 124 ref = vis_alignaddr(ref); |
125 do { /* 5 cycles */ | |
126 vis_ld64(ref[0], TMP0); | |
127 | |
128 vis_ld64_2(ref, 8, TMP2); | |
129 | |
130 vis_ld64_2(ref, 16, TMP4); | |
131 ref += stride; | |
132 | |
133 vis_faligndata(TMP0, TMP2, REF_0); | |
134 vis_st64(REF_0, dest[0]); | |
135 | |
136 vis_faligndata(TMP2, TMP4, REF_2); | |
137 vis_st64_2(REF_2, dest, 8); | |
138 dest += stride; | |
139 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
140 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
141 |
11458 | 142 static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 143 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
144 { |
2979 | 145 ref = vis_alignaddr(ref); |
146 do { /* 4 cycles */ | |
147 vis_ld64(ref[0], TMP0); | |
148 | |
149 vis_ld64(ref[8], TMP2); | |
150 ref += stride; | |
151 | |
152 /* stall */ | |
153 | |
154 vis_faligndata(TMP0, TMP2, REF_0); | |
155 vis_st64(REF_0, dest[0]); | |
156 dest += stride; | |
157 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
158 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
159 |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
160 |
11458 | 161 static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 162 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
163 { |
2979 | 164 int stride_8 = stride + 8; |
165 | |
166 ref = vis_alignaddr(ref); | |
167 | |
168 vis_ld64(ref[0], TMP0); | |
169 | |
170 vis_ld64(ref[8], TMP2); | |
171 | |
172 vis_ld64(ref[16], TMP4); | |
173 | |
174 vis_ld64(dest[0], DST_0); | |
175 | |
176 vis_ld64(dest[8], DST_2); | |
177 | |
178 vis_ld64(constants_fe[0], MASK_fe); | |
179 vis_faligndata(TMP0, TMP2, REF_0); | |
180 | |
181 vis_ld64(constants_7f[0], MASK_7f); | |
182 vis_faligndata(TMP2, TMP4, REF_2); | |
183 | |
184 vis_ld64(constants128[0], CONST_128); | |
185 | |
186 ref += stride; | |
187 height = (height >> 1) - 1; | |
188 | |
189 do { /* 24 cycles */ | |
190 vis_ld64(ref[0], TMP0); | |
191 vis_xor(DST_0, REF_0, TMP6); | |
192 | |
193 vis_ld64_2(ref, 8, TMP2); | |
194 vis_and(TMP6, MASK_fe, TMP6); | |
195 | |
196 vis_ld64_2(ref, 16, TMP4); | |
197 ref += stride; | |
198 vis_mul8x16(CONST_128, TMP6, TMP6); | |
199 vis_xor(DST_2, REF_2, TMP8); | |
200 | |
201 vis_and(TMP8, MASK_fe, TMP8); | |
202 | |
203 vis_or(DST_0, REF_0, TMP10); | |
204 vis_ld64_2(dest, stride, DST_0); | |
205 vis_mul8x16(CONST_128, TMP8, TMP8); | |
206 | |
207 vis_or(DST_2, REF_2, TMP12); | |
208 vis_ld64_2(dest, stride_8, DST_2); | |
209 | |
210 vis_ld64(ref[0], TMP14); | |
211 vis_and(TMP6, MASK_7f, TMP6); | |
212 | |
213 vis_and(TMP8, MASK_7f, TMP8); | |
214 | |
215 vis_psub16(TMP10, TMP6, TMP6); | |
216 vis_st64(TMP6, dest[0]); | |
217 | |
218 vis_psub16(TMP12, TMP8, TMP8); | |
219 vis_st64_2(TMP8, dest, 8); | |
220 | |
221 dest += stride; | |
222 vis_ld64_2(ref, 8, TMP16); | |
223 vis_faligndata(TMP0, TMP2, REF_0); | |
224 | |
225 vis_ld64_2(ref, 16, TMP18); | |
226 vis_faligndata(TMP2, TMP4, REF_2); | |
227 ref += stride; | |
228 | |
229 vis_xor(DST_0, REF_0, TMP20); | |
230 | |
231 vis_and(TMP20, MASK_fe, TMP20); | |
232 | |
233 vis_xor(DST_2, REF_2, TMP22); | |
234 vis_mul8x16(CONST_128, TMP20, TMP20); | |
235 | |
236 vis_and(TMP22, MASK_fe, TMP22); | |
237 | |
238 vis_or(DST_0, REF_0, TMP24); | |
239 vis_mul8x16(CONST_128, TMP22, TMP22); | |
240 | |
241 vis_or(DST_2, REF_2, TMP26); | |
242 | |
243 vis_ld64_2(dest, stride, DST_0); | |
244 vis_faligndata(TMP14, TMP16, REF_0); | |
245 | |
246 vis_ld64_2(dest, stride_8, DST_2); | |
247 vis_faligndata(TMP16, TMP18, REF_2); | |
248 | |
249 vis_and(TMP20, MASK_7f, TMP20); | |
250 | |
251 vis_and(TMP22, MASK_7f, TMP22); | |
252 | |
253 vis_psub16(TMP24, TMP20, TMP20); | |
254 vis_st64(TMP20, dest[0]); | |
255 | |
256 vis_psub16(TMP26, TMP22, TMP22); | |
257 vis_st64_2(TMP22, dest, 8); | |
258 dest += stride; | |
259 } while (--height); | |
260 | |
261 vis_ld64(ref[0], TMP0); | |
262 vis_xor(DST_0, REF_0, TMP6); | |
263 | |
264 vis_ld64_2(ref, 8, TMP2); | |
265 vis_and(TMP6, MASK_fe, TMP6); | |
266 | |
267 vis_ld64_2(ref, 16, TMP4); | |
268 vis_mul8x16(CONST_128, TMP6, TMP6); | |
269 vis_xor(DST_2, REF_2, TMP8); | |
270 | |
271 vis_and(TMP8, MASK_fe, TMP8); | |
272 | |
273 vis_or(DST_0, REF_0, TMP10); | |
274 vis_ld64_2(dest, stride, DST_0); | |
275 vis_mul8x16(CONST_128, TMP8, TMP8); | |
276 | |
277 vis_or(DST_2, REF_2, TMP12); | |
278 vis_ld64_2(dest, stride_8, DST_2); | |
279 | |
280 vis_ld64(ref[0], TMP14); | |
281 vis_and(TMP6, MASK_7f, TMP6); | |
282 | |
283 vis_and(TMP8, MASK_7f, TMP8); | |
284 | |
285 vis_psub16(TMP10, TMP6, TMP6); | |
286 vis_st64(TMP6, dest[0]); | |
287 | |
288 vis_psub16(TMP12, TMP8, TMP8); | |
289 vis_st64_2(TMP8, dest, 8); | |
290 | |
291 dest += stride; | |
292 vis_faligndata(TMP0, TMP2, REF_0); | |
293 | |
294 vis_faligndata(TMP2, TMP4, REF_2); | |
295 | |
296 vis_xor(DST_0, REF_0, TMP20); | |
297 | |
298 vis_and(TMP20, MASK_fe, TMP20); | |
299 | |
300 vis_xor(DST_2, REF_2, TMP22); | |
301 vis_mul8x16(CONST_128, TMP20, TMP20); | |
302 | |
303 vis_and(TMP22, MASK_fe, TMP22); | |
304 | |
305 vis_or(DST_0, REF_0, TMP24); | |
306 vis_mul8x16(CONST_128, TMP22, TMP22); | |
307 | |
308 vis_or(DST_2, REF_2, TMP26); | |
309 | |
310 vis_and(TMP20, MASK_7f, TMP20); | |
311 | |
312 vis_and(TMP22, MASK_7f, TMP22); | |
313 | |
314 vis_psub16(TMP24, TMP20, TMP20); | |
315 vis_st64(TMP20, dest[0]); | |
316 | |
317 vis_psub16(TMP26, TMP22, TMP22); | |
318 vis_st64_2(TMP22, dest, 8); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
319 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
320 |
11458 | 321 static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 322 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
323 { |
2979 | 324 ref = vis_alignaddr(ref); |
325 | |
326 vis_ld64(ref[0], TMP0); | |
327 | |
328 vis_ld64(ref[8], TMP2); | |
329 | |
330 vis_ld64(dest[0], DST_0); | |
331 | |
332 vis_ld64(constants_fe[0], MASK_fe); | |
333 | |
334 vis_ld64(constants_7f[0], MASK_7f); | |
335 vis_faligndata(TMP0, TMP2, REF_0); | |
336 | |
337 vis_ld64(constants128[0], CONST_128); | |
338 | |
339 ref += stride; | |
340 height = (height >> 1) - 1; | |
341 | |
342 do { /* 12 cycles */ | |
343 vis_ld64(ref[0], TMP0); | |
344 vis_xor(DST_0, REF_0, TMP4); | |
345 | |
346 vis_ld64(ref[8], TMP2); | |
347 vis_and(TMP4, MASK_fe, TMP4); | |
348 | |
349 vis_or(DST_0, REF_0, TMP6); | |
350 vis_ld64_2(dest, stride, DST_0); | |
351 ref += stride; | |
352 vis_mul8x16(CONST_128, TMP4, TMP4); | |
353 | |
354 vis_ld64(ref[0], TMP12); | |
355 vis_faligndata(TMP0, TMP2, REF_0); | |
356 | |
357 vis_ld64(ref[8], TMP2); | |
358 vis_xor(DST_0, REF_0, TMP0); | |
359 ref += stride; | |
360 | |
361 vis_and(TMP0, MASK_fe, TMP0); | |
362 | |
363 vis_and(TMP4, MASK_7f, TMP4); | |
364 | |
365 vis_psub16(TMP6, TMP4, TMP4); | |
366 vis_st64(TMP4, dest[0]); | |
367 dest += stride; | |
368 vis_mul8x16(CONST_128, TMP0, TMP0); | |
369 | |
370 vis_or(DST_0, REF_0, TMP6); | |
371 vis_ld64_2(dest, stride, DST_0); | |
372 | |
373 vis_faligndata(TMP12, TMP2, REF_0); | |
374 | |
375 vis_and(TMP0, MASK_7f, TMP0); | |
376 | |
377 vis_psub16(TMP6, TMP0, TMP4); | |
378 vis_st64(TMP4, dest[0]); | |
379 dest += stride; | |
380 } while (--height); | |
381 | |
382 vis_ld64(ref[0], TMP0); | |
383 vis_xor(DST_0, REF_0, TMP4); | |
384 | |
385 vis_ld64(ref[8], TMP2); | |
386 vis_and(TMP4, MASK_fe, TMP4); | |
387 | |
388 vis_or(DST_0, REF_0, TMP6); | |
389 vis_ld64_2(dest, stride, DST_0); | |
390 vis_mul8x16(CONST_128, TMP4, TMP4); | |
391 | |
392 vis_faligndata(TMP0, TMP2, REF_0); | |
393 | |
394 vis_xor(DST_0, REF_0, TMP0); | |
395 | |
396 vis_and(TMP0, MASK_fe, TMP0); | |
397 | |
398 vis_and(TMP4, MASK_7f, TMP4); | |
399 | |
400 vis_psub16(TMP6, TMP4, TMP4); | |
401 vis_st64(TMP4, dest[0]); | |
402 dest += stride; | |
403 vis_mul8x16(CONST_128, TMP0, TMP0); | |
404 | |
405 vis_or(DST_0, REF_0, TMP6); | |
406 | |
407 vis_and(TMP0, MASK_7f, TMP0); | |
408 | |
409 vis_psub16(TMP6, TMP0, TMP4); | |
410 vis_st64(TMP4, dest[0]); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
411 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
412 |
11458 | 413 static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 414 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
415 { |
2979 | 416 unsigned long off = (unsigned long) ref & 0x7; |
417 unsigned long off_plus_1 = off + 1; | |
418 | |
419 ref = vis_alignaddr(ref); | |
420 | |
421 vis_ld64(ref[0], TMP0); | |
422 | |
423 vis_ld64_2(ref, 8, TMP2); | |
424 | |
425 vis_ld64_2(ref, 16, TMP4); | |
426 | |
427 vis_ld64(constants_fe[0], MASK_fe); | |
428 | |
429 vis_ld64(constants_7f[0], MASK_7f); | |
430 vis_faligndata(TMP0, TMP2, REF_0); | |
431 | |
432 vis_ld64(constants128[0], CONST_128); | |
433 vis_faligndata(TMP2, TMP4, REF_4); | |
434 | |
435 if (off != 0x7) { | |
436 vis_alignaddr_g0((void *)off_plus_1); | |
437 vis_faligndata(TMP0, TMP2, REF_2); | |
438 vis_faligndata(TMP2, TMP4, REF_6); | |
439 } else { | |
440 vis_src1(TMP2, REF_2); | |
441 vis_src1(TMP4, REF_6); | |
442 } | |
443 | |
444 ref += stride; | |
445 height = (height >> 1) - 1; | |
446 | |
447 do { /* 34 cycles */ | |
448 vis_ld64(ref[0], TMP0); | |
449 vis_xor(REF_0, REF_2, TMP6); | |
450 | |
451 vis_ld64_2(ref, 8, TMP2); | |
452 vis_xor(REF_4, REF_6, TMP8); | |
453 | |
454 vis_ld64_2(ref, 16, TMP4); | |
455 vis_and(TMP6, MASK_fe, TMP6); | |
456 ref += stride; | |
457 | |
458 vis_ld64(ref[0], TMP14); | |
459 vis_mul8x16(CONST_128, TMP6, TMP6); | |
460 vis_and(TMP8, MASK_fe, TMP8); | |
461 | |
462 vis_ld64_2(ref, 8, TMP16); | |
463 vis_mul8x16(CONST_128, TMP8, TMP8); | |
464 vis_or(REF_0, REF_2, TMP10); | |
465 | |
466 vis_ld64_2(ref, 16, TMP18); | |
467 ref += stride; | |
468 vis_or(REF_4, REF_6, TMP12); | |
469 | |
470 vis_alignaddr_g0((void *)off); | |
471 | |
472 vis_faligndata(TMP0, TMP2, REF_0); | |
473 | |
474 vis_faligndata(TMP2, TMP4, REF_4); | |
475 | |
476 if (off != 0x7) { | |
477 vis_alignaddr_g0((void *)off_plus_1); | |
478 vis_faligndata(TMP0, TMP2, REF_2); | |
479 vis_faligndata(TMP2, TMP4, REF_6); | |
480 } else { | |
481 vis_src1(TMP2, REF_2); | |
482 vis_src1(TMP4, REF_6); | |
483 } | |
484 | |
485 vis_and(TMP6, MASK_7f, TMP6); | |
486 | |
487 vis_and(TMP8, MASK_7f, TMP8); | |
488 | |
489 vis_psub16(TMP10, TMP6, TMP6); | |
490 vis_st64(TMP6, dest[0]); | |
491 | |
492 vis_psub16(TMP12, TMP8, TMP8); | |
493 vis_st64_2(TMP8, dest, 8); | |
494 dest += stride; | |
495 | |
496 vis_xor(REF_0, REF_2, TMP6); | |
497 | |
498 vis_xor(REF_4, REF_6, TMP8); | |
499 | |
500 vis_and(TMP6, MASK_fe, TMP6); | |
501 | |
502 vis_mul8x16(CONST_128, TMP6, TMP6); | |
503 vis_and(TMP8, MASK_fe, TMP8); | |
504 | |
505 vis_mul8x16(CONST_128, TMP8, TMP8); | |
506 vis_or(REF_0, REF_2, TMP10); | |
507 | |
508 vis_or(REF_4, REF_6, TMP12); | |
509 | |
510 vis_alignaddr_g0((void *)off); | |
511 | |
512 vis_faligndata(TMP14, TMP16, REF_0); | |
513 | |
514 vis_faligndata(TMP16, TMP18, REF_4); | |
515 | |
516 if (off != 0x7) { | |
517 vis_alignaddr_g0((void *)off_plus_1); | |
518 vis_faligndata(TMP14, TMP16, REF_2); | |
519 vis_faligndata(TMP16, TMP18, REF_6); | |
520 } else { | |
521 vis_src1(TMP16, REF_2); | |
522 vis_src1(TMP18, REF_6); | |
523 } | |
524 | |
525 vis_and(TMP6, MASK_7f, TMP6); | |
526 | |
527 vis_and(TMP8, MASK_7f, TMP8); | |
528 | |
529 vis_psub16(TMP10, TMP6, TMP6); | |
530 vis_st64(TMP6, dest[0]); | |
531 | |
532 vis_psub16(TMP12, TMP8, TMP8); | |
533 vis_st64_2(TMP8, dest, 8); | |
534 dest += stride; | |
535 } while (--height); | |
536 | |
537 vis_ld64(ref[0], TMP0); | |
538 vis_xor(REF_0, REF_2, TMP6); | |
539 | |
540 vis_ld64_2(ref, 8, TMP2); | |
541 vis_xor(REF_4, REF_6, TMP8); | |
542 | |
543 vis_ld64_2(ref, 16, TMP4); | |
544 vis_and(TMP6, MASK_fe, TMP6); | |
545 | |
546 vis_mul8x16(CONST_128, TMP6, TMP6); | |
547 vis_and(TMP8, MASK_fe, TMP8); | |
548 | |
549 vis_mul8x16(CONST_128, TMP8, TMP8); | |
550 vis_or(REF_0, REF_2, TMP10); | |
551 | |
552 vis_or(REF_4, REF_6, TMP12); | |
553 | |
554 vis_alignaddr_g0((void *)off); | |
555 | |
556 vis_faligndata(TMP0, TMP2, REF_0); | |
557 | |
558 vis_faligndata(TMP2, TMP4, REF_4); | |
559 | |
560 if (off != 0x7) { | |
561 vis_alignaddr_g0((void *)off_plus_1); | |
562 vis_faligndata(TMP0, TMP2, REF_2); | |
563 vis_faligndata(TMP2, TMP4, REF_6); | |
564 } else { | |
565 vis_src1(TMP2, REF_2); | |
566 vis_src1(TMP4, REF_6); | |
567 } | |
568 | |
569 vis_and(TMP6, MASK_7f, TMP6); | |
570 | |
571 vis_and(TMP8, MASK_7f, TMP8); | |
572 | |
573 vis_psub16(TMP10, TMP6, TMP6); | |
574 vis_st64(TMP6, dest[0]); | |
575 | |
576 vis_psub16(TMP12, TMP8, TMP8); | |
577 vis_st64_2(TMP8, dest, 8); | |
578 dest += stride; | |
579 | |
580 vis_xor(REF_0, REF_2, TMP6); | |
581 | |
582 vis_xor(REF_4, REF_6, TMP8); | |
583 | |
584 vis_and(TMP6, MASK_fe, TMP6); | |
585 | |
586 vis_mul8x16(CONST_128, TMP6, TMP6); | |
587 vis_and(TMP8, MASK_fe, TMP8); | |
588 | |
589 vis_mul8x16(CONST_128, TMP8, TMP8); | |
590 vis_or(REF_0, REF_2, TMP10); | |
591 | |
592 vis_or(REF_4, REF_6, TMP12); | |
593 | |
594 vis_and(TMP6, MASK_7f, TMP6); | |
595 | |
596 vis_and(TMP8, MASK_7f, TMP8); | |
597 | |
598 vis_psub16(TMP10, TMP6, TMP6); | |
599 vis_st64(TMP6, dest[0]); | |
600 | |
601 vis_psub16(TMP12, TMP8, TMP8); | |
602 vis_st64_2(TMP8, dest, 8); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
603 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
604 |
11458 | 605 static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 606 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
607 { |
2979 | 608 unsigned long off = (unsigned long) ref & 0x7; |
609 unsigned long off_plus_1 = off + 1; | |
610 | |
611 ref = vis_alignaddr(ref); | |
612 | |
613 vis_ld64(ref[0], TMP0); | |
614 | |
615 vis_ld64(ref[8], TMP2); | |
616 | |
617 vis_ld64(constants_fe[0], MASK_fe); | |
618 | |
619 vis_ld64(constants_7f[0], MASK_7f); | |
620 | |
621 vis_ld64(constants128[0], CONST_128); | |
622 vis_faligndata(TMP0, TMP2, REF_0); | |
623 | |
624 if (off != 0x7) { | |
625 vis_alignaddr_g0((void *)off_plus_1); | |
626 vis_faligndata(TMP0, TMP2, REF_2); | |
627 } else { | |
628 vis_src1(TMP2, REF_2); | |
629 } | |
630 | |
631 ref += stride; | |
632 height = (height >> 1) - 1; | |
633 | |
634 do { /* 20 cycles */ | |
635 vis_ld64(ref[0], TMP0); | |
636 vis_xor(REF_0, REF_2, TMP4); | |
637 | |
638 vis_ld64_2(ref, 8, TMP2); | |
639 vis_and(TMP4, MASK_fe, TMP4); | |
640 ref += stride; | |
641 | |
642 vis_ld64(ref[0], TMP8); | |
643 vis_or(REF_0, REF_2, TMP6); | |
644 vis_mul8x16(CONST_128, TMP4, TMP4); | |
645 | |
646 vis_alignaddr_g0((void *)off); | |
647 | |
648 vis_ld64_2(ref, 8, TMP10); | |
649 ref += stride; | |
650 vis_faligndata(TMP0, TMP2, REF_0); | |
651 | |
652 if (off != 0x7) { | |
653 vis_alignaddr_g0((void *)off_plus_1); | |
654 vis_faligndata(TMP0, TMP2, REF_2); | |
655 } else { | |
656 vis_src1(TMP2, REF_2); | |
657 } | |
658 | |
659 vis_and(TMP4, MASK_7f, TMP4); | |
660 | |
661 vis_psub16(TMP6, TMP4, DST_0); | |
662 vis_st64(DST_0, dest[0]); | |
663 dest += stride; | |
664 | |
665 vis_xor(REF_0, REF_2, TMP12); | |
666 | |
667 vis_and(TMP12, MASK_fe, TMP12); | |
668 | |
669 vis_or(REF_0, REF_2, TMP14); | |
670 vis_mul8x16(CONST_128, TMP12, TMP12); | |
671 | |
672 vis_alignaddr_g0((void *)off); | |
673 vis_faligndata(TMP8, TMP10, REF_0); | |
674 if (off != 0x7) { | |
675 vis_alignaddr_g0((void *)off_plus_1); | |
676 vis_faligndata(TMP8, TMP10, REF_2); | |
677 } else { | |
678 vis_src1(TMP10, REF_2); | |
679 } | |
680 | |
681 vis_and(TMP12, MASK_7f, TMP12); | |
682 | |
683 vis_psub16(TMP14, TMP12, DST_0); | |
684 vis_st64(DST_0, dest[0]); | |
685 dest += stride; | |
686 } while (--height); | |
687 | |
688 vis_ld64(ref[0], TMP0); | |
689 vis_xor(REF_0, REF_2, TMP4); | |
690 | |
691 vis_ld64_2(ref, 8, TMP2); | |
692 vis_and(TMP4, MASK_fe, TMP4); | |
693 | |
694 vis_or(REF_0, REF_2, TMP6); | |
695 vis_mul8x16(CONST_128, TMP4, TMP4); | |
696 | |
697 vis_alignaddr_g0((void *)off); | |
698 | |
699 vis_faligndata(TMP0, TMP2, REF_0); | |
700 | |
701 if (off != 0x7) { | |
702 vis_alignaddr_g0((void *)off_plus_1); | |
703 vis_faligndata(TMP0, TMP2, REF_2); | |
704 } else { | |
705 vis_src1(TMP2, REF_2); | |
706 } | |
707 | |
708 vis_and(TMP4, MASK_7f, TMP4); | |
709 | |
710 vis_psub16(TMP6, TMP4, DST_0); | |
711 vis_st64(DST_0, dest[0]); | |
712 dest += stride; | |
713 | |
714 vis_xor(REF_0, REF_2, TMP12); | |
715 | |
716 vis_and(TMP12, MASK_fe, TMP12); | |
717 | |
718 vis_or(REF_0, REF_2, TMP14); | |
719 vis_mul8x16(CONST_128, TMP12, TMP12); | |
720 | |
721 vis_and(TMP12, MASK_7f, TMP12); | |
722 | |
723 vis_psub16(TMP14, TMP12, DST_0); | |
724 vis_st64(DST_0, dest[0]); | |
725 dest += stride; | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
726 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
727 |
/*
 * MC_avg_x_16_vis - VIS motion-compensation kernel; one row per loop
 * iteration, written as two stores (dest[0] and dest[8]).
 *
 * Per row the code loads ref[0], ref[8] and ref[16] and also loads the
 * current dest[0] and dest[8], so dest is an input as well as an output.
 * Two realigned views of the reference row are built: one using the
 * original misalignment `off`, one using `off + 1`.
 * NOTE(review): from the name this is presumably the horizontal half-pel
 * interpolation averaged with the existing destination; the vis_* macros
 * are defined outside this view, so confirm against their definitions.
 *
 * dest   - destination rows, read and then overwritten.
 * ref    - reference rows; its low three address bits are saved in `off`
 *          before ref is replaced by the result of vis_alignaddr().
 * stride - added to both ref and dest after every row.
 * height - row count.  The loop is do { } while (--height), so a value
 *          <= 0 does not terminate normally.
 */
11458 | 728 static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * ref,
2979 | 729 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
730 { |
2979 | 731 unsigned long off = (unsigned long) ref & 0x7; |
732 unsigned long off_plus_1 = off + 1; | |
733 | |
734 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
735 | |
736 vis_ld64(constants3[0], CONST_3); | |
737 vis_fzero(ZERO); | |
738 vis_ld64(constants256_512[0], CONST_256); | |
739 | |
740 ref = vis_alignaddr(ref); | |
741 do { /* 26 cycles */ | |
742 vis_ld64(ref[0], TMP0); | |
743 | |
744 vis_ld64(ref[8], TMP2); | |
745 | |
/* The alignment offset is re-set to `off` every row because the branch
 * below may have switched it to off + 1 on the previous iteration. */
746 vis_alignaddr_g0((void *)off); | |
747 | |
748 vis_ld64(ref[16], TMP4); | |
749 | |
750 vis_ld64(dest[0], DST_0); | |
751 vis_faligndata(TMP0, TMP2, REF_0); | |
752 | |
753 vis_ld64(dest[8], DST_2); | |
754 vis_faligndata(TMP2, TMP4, REF_4); | |
755 | |
/* NOTE(review): off == 7 is special-cased, presumably because off + 1
 * would no longer be a valid 0..7 alignment offset; in that case the
 * "+1" view is taken from the following word via vis_src1 -- confirm. */
756 if (off != 0x7) { | |
757 vis_alignaddr_g0((void *)off_plus_1); | |
758 vis_faligndata(TMP0, TMP2, REF_2); | |
759 vis_faligndata(TMP2, TMP4, REF_6); | |
760 } else { | |
761 vis_src1(TMP2, REF_2); | |
762 vis_src1(TMP4, REF_6); | |
763 } | |
764 | |
765 vis_mul8x16au(REF_0, CONST_256, TMP0); | |
766 | |
767 vis_pmerge(ZERO, REF_2, TMP4); | |
768 vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
769 | |
770 vis_pmerge(ZERO, REF_2_1, TMP6); | |
771 | |
772 vis_padd16(TMP0, TMP4, TMP0); | |
773 | |
774 vis_mul8x16al(DST_0, CONST_512, TMP4); | |
775 vis_padd16(TMP2, TMP6, TMP2); | |
776 | |
777 vis_mul8x16al(DST_1, CONST_512, TMP6); | |
778 | |
779 vis_mul8x16au(REF_6, CONST_256, TMP12); | |
780 | |
781 vis_padd16(TMP0, TMP4, TMP0); | |
782 vis_mul8x16au(REF_6_1, CONST_256, TMP14); | |
783 | |
784 vis_padd16(TMP2, TMP6, TMP2); | |
785 vis_mul8x16au(REF_4, CONST_256, TMP16); | |
786 | |
787 vis_padd16(TMP0, CONST_3, TMP8); | |
788 vis_mul8x16au(REF_4_1, CONST_256, TMP18); | |
789 | |
790 vis_padd16(TMP2, CONST_3, TMP10); | |
791 vis_pack16(TMP8, DST_0); | |
792 | |
793 vis_pack16(TMP10, DST_1); | |
794 vis_padd16(TMP16, TMP12, TMP0); | |
795 | |
796 vis_st64(DST_0, dest[0]); | |
797 vis_mul8x16al(DST_2, CONST_512, TMP4); | |
798 vis_padd16(TMP18, TMP14, TMP2); | |
799 | |
800 vis_mul8x16al(DST_3, CONST_512, TMP6); | |
801 vis_padd16(TMP0, CONST_3, TMP0); | |
802 | |
803 vis_padd16(TMP2, CONST_3, TMP2); | |
804 | |
805 vis_padd16(TMP0, TMP4, TMP0); | |
806 | |
807 vis_padd16(TMP2, TMP6, TMP2); | |
808 vis_pack16(TMP0, DST_2); | |
809 | |
810 vis_pack16(TMP2, DST_3); | |
811 vis_st64(DST_2, dest[8]); | |
812 | |
813 ref += stride; | |
814 dest += stride; | |
815 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
816 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
817 |
/*
 * MC_avg_x_8_vis - VIS motion-compensation kernel; four rows per loop
 * iteration, one vis_st64 to dest[0] per row.
 *
 * Each iteration loads ref[0] and ref+8 for four consecutive rows, builds
 * two realigned views of every row (offset `off` and `off + 1`), loads the
 * four matching dest rows and stores four result rows.  dest is therefore
 * both read and written.
 * NOTE(review): from the name this is presumably the 8-wide horizontal
 * half-pel interpolation averaged with the destination; the vis_* macros
 * are not visible here -- confirm.
 *
 * dest   - destination rows, read and then overwritten.
 * ref    - reference rows; low three address bits are saved in `off`
 *          before ref is replaced by the result of vis_alignaddr().
 * stride - row pitch applied to both ref and dest.
 * height - row count; it is divided by 4 (height >>= 2) and the loop is
 *          do { } while (--height), so leftover rows of a non-multiple of
 *          4 are not processed and a value below 4 does not terminate
 *          normally.
 */
11458 | 818 static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * ref,
2979 | 819 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
820 { |
2979 | 821 unsigned long off = (unsigned long) ref & 0x7; |
822 unsigned long off_plus_1 = off + 1; | |
823 int stride_times_2 = stride << 1; | |
824 | |
825 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
826 | |
827 vis_ld64(constants3[0], CONST_3); | |
828 vis_fzero(ZERO); | |
829 vis_ld64(constants256_512[0], CONST_256); | |
830 | |
831 ref = vis_alignaddr(ref); | |
832 height >>= 2; | |
833 do { /* 47 cycles */ | |
834 vis_ld64(ref[0], TMP0); | |
835 | |
836 vis_ld64_2(ref, 8, TMP2); | |
837 ref += stride; | |
838 | |
839 vis_alignaddr_g0((void *)off); | |
840 | |
841 vis_ld64(ref[0], TMP4); | |
842 vis_faligndata(TMP0, TMP2, REF_0); | |
843 | |
844 vis_ld64_2(ref, 8, TMP6); | |
845 ref += stride; | |
846 | |
847 vis_ld64(ref[0], TMP8); | |
848 | |
849 vis_ld64_2(ref, 8, TMP10); | |
850 ref += stride; | |
851 vis_faligndata(TMP4, TMP6, REF_4); | |
852 | |
853 vis_ld64(ref[0], TMP12); | |
854 | |
855 vis_ld64_2(ref, 8, TMP14); | |
856 ref += stride; | |
857 vis_faligndata(TMP8, TMP10, REF_S0); | |
858 | |
859 vis_faligndata(TMP12, TMP14, REF_S4); | |
860 | |
/* Build the "off + 1" view of all four rows.  NOTE(review): off == 7 is
 * special-cased, presumably because off + 1 is not a valid 0..7 alignment
 * offset -- confirm.  Both branches also start loading dest rows 0 and 1. */
861 if (off != 0x7) { | |
862 vis_alignaddr_g0((void *)off_plus_1); | |
863 | |
864 vis_ld64(dest[0], DST_0); | |
865 vis_faligndata(TMP0, TMP2, REF_2); | |
866 | |
867 vis_ld64_2(dest, stride, DST_2); | |
868 vis_faligndata(TMP4, TMP6, REF_6); | |
869 | |
870 vis_faligndata(TMP8, TMP10, REF_S2); | |
871 | |
872 vis_faligndata(TMP12, TMP14, REF_S6); | |
873 } else { | |
874 vis_ld64(dest[0], DST_0); | |
875 vis_src1(TMP2, REF_2); | |
876 | |
877 vis_ld64_2(dest, stride, DST_2); | |
878 vis_src1(TMP6, REF_6); | |
879 | |
880 vis_src1(TMP10, REF_S2); | |
881 | |
882 vis_src1(TMP14, REF_S6); | |
883 } | |
884 | |
885 vis_pmerge(ZERO, REF_0, TMP0); | |
886 vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
887 | |
888 vis_pmerge(ZERO, REF_2, TMP4); | |
889 vis_mul8x16au(REF_2_1, CONST_256, TMP6); | |
890 | |
891 vis_padd16(TMP0, CONST_3, TMP0); | |
892 vis_mul8x16al(DST_0, CONST_512, TMP16); | |
893 | |
894 vis_padd16(TMP2, CONST_3, TMP2); | |
895 vis_mul8x16al(DST_1, CONST_512, TMP18); | |
896 | |
897 vis_padd16(TMP0, TMP4, TMP0); | |
898 vis_mul8x16au(REF_4, CONST_256, TMP8); | |
899 | |
900 vis_padd16(TMP2, TMP6, TMP2); | |
901 vis_mul8x16au(REF_4_1, CONST_256, TMP10); | |
902 | |
903 vis_padd16(TMP0, TMP16, TMP0); | |
904 vis_mul8x16au(REF_6, CONST_256, TMP12); | |
905 | |
906 vis_padd16(TMP2, TMP18, TMP2); | |
907 vis_mul8x16au(REF_6_1, CONST_256, TMP14); | |
908 | |
909 vis_padd16(TMP8, CONST_3, TMP8); | |
910 vis_mul8x16al(DST_2, CONST_512, TMP16); | |
911 | |
912 vis_padd16(TMP8, TMP12, TMP8); | |
913 vis_mul8x16al(DST_3, CONST_512, TMP18); | |
914 | |
915 vis_padd16(TMP10, TMP14, TMP10); | |
916 vis_pack16(TMP0, DST_0); | |
917 | |
918 vis_pack16(TMP2, DST_1); | |
919 vis_st64(DST_0, dest[0]); | |
920 dest += stride; | |
921 vis_padd16(TMP10, CONST_3, TMP10); | |
922 | |
/* dest now points at row 1 of this group, so dest + stride is row 2 and
 * dest + stride_times_2 is row 3.  Row 3 is loaded into TMP4 instead of
 * DST_2 because DST_2 is still needed for the row-1 result below. */
923 vis_ld64_2(dest, stride, DST_0); | |
924 vis_padd16(TMP8, TMP16, TMP8); | |
925 | |
926 vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/); | |
927 vis_padd16(TMP10, TMP18, TMP10); | |
928 vis_pack16(TMP8, DST_2); | |
929 | |
930 vis_pack16(TMP10, DST_3); | |
931 vis_st64(DST_2, dest[0]); | |
932 dest += stride; | |
933 | |
934 vis_mul8x16au(REF_S0_1, CONST_256, TMP2); | |
935 vis_pmerge(ZERO, REF_S0, TMP0); | |
936 | |
937 vis_pmerge(ZERO, REF_S2, TMP24); | |
938 vis_mul8x16au(REF_S2_1, CONST_256, TMP6); | |
939 | |
940 vis_padd16(TMP0, CONST_3, TMP0); | |
941 vis_mul8x16au(REF_S4, CONST_256, TMP8); | |
942 | |
943 vis_padd16(TMP2, CONST_3, TMP2); | |
944 vis_mul8x16au(REF_S4_1, CONST_256, TMP10); | |
945 | |
946 vis_padd16(TMP0, TMP24, TMP0); | |
947 vis_mul8x16au(REF_S6, CONST_256, TMP12); | |
948 | |
949 vis_padd16(TMP2, TMP6, TMP2); | |
950 vis_mul8x16au(REF_S6_1, CONST_256, TMP14); | |
951 | |
952 vis_padd16(TMP8, CONST_3, TMP8); | |
953 vis_mul8x16al(DST_0, CONST_512, TMP16); | |
954 | |
955 vis_padd16(TMP10, CONST_3, TMP10); | |
956 vis_mul8x16al(DST_1, CONST_512, TMP18); | |
957 | |
958 vis_padd16(TMP8, TMP12, TMP8); | |
959 vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20); | |
960 | |
961 vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22); | |
962 vis_padd16(TMP0, TMP16, TMP0); | |
963 | |
964 vis_padd16(TMP2, TMP18, TMP2); | |
965 vis_pack16(TMP0, DST_0); | |
966 | |
967 vis_padd16(TMP10, TMP14, TMP10); | |
968 vis_pack16(TMP2, DST_1); | |
969 vis_st64(DST_0, dest[0]); | |
970 dest += stride; | |
971 | |
972 vis_padd16(TMP8, TMP20, TMP8); | |
973 | |
974 vis_padd16(TMP10, TMP22, TMP10); | |
975 vis_pack16(TMP8, DST_2); | |
976 | |
977 vis_pack16(TMP10, DST_3); | |
978 vis_st64(DST_2, dest[0]); | |
979 dest += stride; | |
980 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
981 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
982 |
/*
 * MC_put_y_16_vis - VIS motion-compensation kernel; each output row is
 * written as two stores (dest[0] and dest + 8) and is computed from two
 * vertically adjacent reference rows.  dest is never loaded.
 *
 * Structure: a prologue loads reference rows 0 and 1, the main loop
 * produces two output rows per iteration while loading the next two
 * reference rows, and an epilogue loads one more reference row and emits
 * the final two output rows.  For an even height this reads height + 1
 * reference rows, each as three words (ref, ref + 8, ref + 16).
 * NOTE(review): the xor / and MASK_fe / mul8x16 by CONST_128 / and MASK_7f /
 * or / psub16 sequence is presumably a rounded byte-wise average of the two
 * rows; the vis_* macros are not visible here -- confirm.
 *
 * dest   - destination rows, write only.
 * ref    - reference rows; replaced by the result of vis_alignaddr().
 * stride - row pitch applied to both ref and dest.
 * height - row count.  The loop counter is (height >> 1) - 1 and is tested
 *          with do { } while (--height), so a height below 4 starts the
 *          counter at 0 or less and the loop does not terminate normally.
 */
11458 | 983 static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * ref,
2979 | 984 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
985 { |
2979 | 986 ref = vis_alignaddr(ref); |
987 vis_ld64(ref[0], TMP0); | |
988 | |
989 vis_ld64_2(ref, 8, TMP2); | |
990 | |
991 vis_ld64_2(ref, 16, TMP4); | |
992 ref += stride; | |
993 | |
994 vis_ld64(ref[0], TMP6); | |
995 vis_faligndata(TMP0, TMP2, REF_0); | |
996 | |
997 vis_ld64_2(ref, 8, TMP8); | |
998 vis_faligndata(TMP2, TMP4, REF_4); | |
999 | |
1000 vis_ld64_2(ref, 16, TMP10); | |
1001 ref += stride; | |
1002 | |
1003 vis_ld64(constants_fe[0], MASK_fe); | |
1004 vis_faligndata(TMP6, TMP8, REF_2); | |
1005 | |
1006 vis_ld64(constants_7f[0], MASK_7f); | |
1007 vis_faligndata(TMP8, TMP10, REF_6); | |
1008 | |
1009 vis_ld64(constants128[0], CONST_128); | |
/* One pair of rows is handled by the epilogue after the loop, hence the
 * "- 1" on the iteration count. */
1010 height = (height >> 1) - 1; | |
1011 do { /* 24 cycles */ | |
1012 vis_ld64(ref[0], TMP0); | |
1013 vis_xor(REF_0, REF_2, TMP12); | |
1014 | |
1015 vis_ld64_2(ref, 8, TMP2); | |
1016 vis_xor(REF_4, REF_6, TMP16); | |
1017 | |
1018 vis_ld64_2(ref, 16, TMP4); | |
1019 ref += stride; | |
1020 vis_or(REF_0, REF_2, TMP14); | |
1021 | |
1022 vis_ld64(ref[0], TMP6); | |
1023 vis_or(REF_4, REF_6, TMP18); | |
1024 | |
1025 vis_ld64_2(ref, 8, TMP8); | |
1026 vis_faligndata(TMP0, TMP2, REF_0); | |
1027 | |
1028 vis_ld64_2(ref, 16, TMP10); | |
1029 ref += stride; | |
1030 vis_faligndata(TMP2, TMP4, REF_4); | |
1031 | |
1032 vis_and(TMP12, MASK_fe, TMP12); | |
1033 | |
1034 vis_and(TMP16, MASK_fe, TMP16); | |
1035 vis_mul8x16(CONST_128, TMP12, TMP12); | |
1036 | |
1037 vis_mul8x16(CONST_128, TMP16, TMP16); | |
1038 vis_xor(REF_0, REF_2, TMP0); | |
1039 | |
1040 vis_xor(REF_4, REF_6, TMP2); | |
1041 | |
1042 vis_or(REF_0, REF_2, TMP20); | |
1043 | |
1044 vis_and(TMP12, MASK_7f, TMP12); | |
1045 | |
1046 vis_and(TMP16, MASK_7f, TMP16); | |
1047 | |
1048 vis_psub16(TMP14, TMP12, TMP12); | |
1049 vis_st64(TMP12, dest[0]); | |
1050 | |
1051 vis_psub16(TMP18, TMP16, TMP16); | |
1052 vis_st64_2(TMP16, dest, 8); | |
1053 dest += stride; | |
1054 | |
1055 vis_or(REF_4, REF_6, TMP18); | |
1056 | |
1057 vis_and(TMP0, MASK_fe, TMP0); | |
1058 | |
1059 vis_and(TMP2, MASK_fe, TMP2); | |
1060 vis_mul8x16(CONST_128, TMP0, TMP0); | |
1061 | |
1062 vis_faligndata(TMP6, TMP8, REF_2); | |
1063 vis_mul8x16(CONST_128, TMP2, TMP2); | |
1064 | |
1065 vis_faligndata(TMP8, TMP10, REF_6); | |
1066 | |
1067 vis_and(TMP0, MASK_7f, TMP0); | |
1068 | |
1069 vis_and(TMP2, MASK_7f, TMP2); | |
1070 | |
1071 vis_psub16(TMP20, TMP0, TMP0); | |
1072 vis_st64(TMP0, dest[0]); | |
1073 | |
1074 vis_psub16(TMP18, TMP2, TMP2); | |
1075 vis_st64_2(TMP2, dest, 8); | |
1076 dest += stride; | |
1077 } while (--height); | |
1078 | |
/* Epilogue: same computation as one loop iteration, but only one further
 * reference row is loaded and ref is not advanced again. */
1079 vis_ld64(ref[0], TMP0); | |
1080 vis_xor(REF_0, REF_2, TMP12); | |
1081 | |
1082 vis_ld64_2(ref, 8, TMP2); | |
1083 vis_xor(REF_4, REF_6, TMP16); | |
1084 | |
1085 vis_ld64_2(ref, 16, TMP4); | |
1086 vis_or(REF_0, REF_2, TMP14); | |
1087 | |
1088 vis_or(REF_4, REF_6, TMP18); | |
1089 | |
1090 vis_faligndata(TMP0, TMP2, REF_0); | |
1091 | |
1092 vis_faligndata(TMP2, TMP4, REF_4); | |
1093 | |
1094 vis_and(TMP12, MASK_fe, TMP12); | |
1095 | |
1096 vis_and(TMP16, MASK_fe, TMP16); | |
1097 vis_mul8x16(CONST_128, TMP12, TMP12); | |
1098 | |
1099 vis_mul8x16(CONST_128, TMP16, TMP16); | |
1100 vis_xor(REF_0, REF_2, TMP0); | |
1101 | |
1102 vis_xor(REF_4, REF_6, TMP2); | |
1103 | |
1104 vis_or(REF_0, REF_2, TMP20); | |
1105 | |
1106 vis_and(TMP12, MASK_7f, TMP12); | |
1107 | |
1108 vis_and(TMP16, MASK_7f, TMP16); | |
1109 | |
1110 vis_psub16(TMP14, TMP12, TMP12); | |
1111 vis_st64(TMP12, dest[0]); | |
1112 | |
1113 vis_psub16(TMP18, TMP16, TMP16); | |
1114 vis_st64_2(TMP16, dest, 8); | |
1115 dest += stride; | |
1116 | |
1117 vis_or(REF_4, REF_6, TMP18); | |
1118 | |
1119 vis_and(TMP0, MASK_fe, TMP0); | |
1120 | |
1121 vis_and(TMP2, MASK_fe, TMP2); | |
1122 vis_mul8x16(CONST_128, TMP0, TMP0); | |
1123 | |
1124 vis_mul8x16(CONST_128, TMP2, TMP2); | |
1125 | |
1126 vis_and(TMP0, MASK_7f, TMP0); | |
1127 | |
1128 vis_and(TMP2, MASK_7f, TMP2); | |
1129 | |
1130 vis_psub16(TMP20, TMP0, TMP0); | |
1131 vis_st64(TMP0, dest[0]); | |
1132 | |
1133 vis_psub16(TMP18, TMP2, TMP2); | |
1134 vis_st64_2(TMP2, dest, 8); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1135 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1136 |
/*
 * MC_put_y_8_vis - VIS motion-compensation kernel; one vis_st64 to dest[0]
 * per output row, each row computed from two vertically adjacent reference
 * rows.  dest is never loaded.
 *
 * Structure: a prologue loads reference rows 0 and 1, the main loop emits
 * two output rows per iteration while loading the next two reference rows,
 * and an epilogue loads one more reference row and emits the last two
 * output rows.  For an even height this reads height + 1 reference rows,
 * each as two words (ref and ref + 8).
 * NOTE(review): the xor / and MASK_fe / mul8x16 by CONST_128 / and MASK_7f /
 * or / psub16 sequence is presumably a rounded byte-wise average of the two
 * rows; the vis_* macros are not visible here -- confirm.
 *
 * dest   - destination rows, write only.
 * ref    - reference rows; replaced by the result of vis_alignaddr().
 * stride - row pitch applied to both ref and dest.
 * height - row count.  The loop counter is (height >> 1) - 1 and is tested
 *          with do { } while (--height), so a height below 4 starts the
 *          counter at 0 or less and the loop does not terminate normally.
 */
11458 | 1137 static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * ref,
2979 | 1138 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1139 { |
2979 | 1140 ref = vis_alignaddr(ref); |
1141 vis_ld64(ref[0], TMP0); | |
1142 | |
1143 vis_ld64_2(ref, 8, TMP2); | |
1144 ref += stride; | |
1145 | |
1146 vis_ld64(ref[0], TMP4); | |
1147 | |
1148 vis_ld64_2(ref, 8, TMP6); | |
1149 ref += stride; | |
1150 | |
1151 vis_ld64(constants_fe[0], MASK_fe); | |
1152 vis_faligndata(TMP0, TMP2, REF_0); | |
1153 | |
1154 vis_ld64(constants_7f[0], MASK_7f); | |
1155 vis_faligndata(TMP4, TMP6, REF_2); | |
1156 | |
1157 vis_ld64(constants128[0], CONST_128); | |
/* One pair of rows is handled by the epilogue after the loop, hence the
 * "- 1" on the iteration count. */
1158 height = (height >> 1) - 1; | |
1159 do { /* 12 cycles */ | |
1160 vis_ld64(ref[0], TMP0); | |
1161 vis_xor(REF_0, REF_2, TMP4); | |
1162 | |
1163 vis_ld64_2(ref, 8, TMP2); | |
1164 ref += stride; | |
1165 vis_and(TMP4, MASK_fe, TMP4); | |
1166 | |
1167 vis_or(REF_0, REF_2, TMP6); | |
1168 vis_mul8x16(CONST_128, TMP4, TMP4); | |
1169 | |
/* REF_0 is refreshed with the newly loaded row before TMP0/TMP2 are
 * reused for the following row's loads. */
1170 vis_faligndata(TMP0, TMP2, REF_0); | |
1171 vis_ld64(ref[0], TMP0); | |
1172 | |
1173 vis_ld64_2(ref, 8, TMP2); | |
1174 ref += stride; | |
1175 vis_xor(REF_0, REF_2, TMP12); | |
1176 | |
1177 vis_and(TMP4, MASK_7f, TMP4); | |
1178 | |
1179 vis_and(TMP12, MASK_fe, TMP12); | |
1180 | |
1181 vis_mul8x16(CONST_128, TMP12, TMP12); | |
1182 vis_or(REF_0, REF_2, TMP14); | |
1183 | |
1184 vis_psub16(TMP6, TMP4, DST_0); | |
1185 vis_st64(DST_0, dest[0]); | |
1186 dest += stride; | |
1187 | |
1188 vis_faligndata(TMP0, TMP2, REF_2); | |
1189 | |
1190 vis_and(TMP12, MASK_7f, TMP12); | |
1191 | |
1192 vis_psub16(TMP14, TMP12, DST_0); | |
1193 vis_st64(DST_0, dest[0]); | |
1194 dest += stride; | |
1195 } while (--height); | |
1196 | |
/* Epilogue: same computation as one loop iteration, but only one further
 * reference row is loaded and ref is not advanced again. */
1197 vis_ld64(ref[0], TMP0); | |
1198 vis_xor(REF_0, REF_2, TMP4); | |
1199 | |
1200 vis_ld64_2(ref, 8, TMP2); | |
1201 vis_and(TMP4, MASK_fe, TMP4); | |
1202 | |
1203 vis_or(REF_0, REF_2, TMP6); | |
1204 vis_mul8x16(CONST_128, TMP4, TMP4); | |
1205 | |
1206 vis_faligndata(TMP0, TMP2, REF_0); | |
1207 | |
1208 vis_xor(REF_0, REF_2, TMP12); | |
1209 | |
1210 vis_and(TMP4, MASK_7f, TMP4); | |
1211 | |
1212 vis_and(TMP12, MASK_fe, TMP12); | |
1213 | |
1214 vis_mul8x16(CONST_128, TMP12, TMP12); | |
1215 vis_or(REF_0, REF_2, TMP14); | |
1216 | |
1217 vis_psub16(TMP6, TMP4, DST_0); | |
1218 vis_st64(DST_0, dest[0]); | |
1219 dest += stride; | |
1220 | |
1221 vis_and(TMP12, MASK_7f, TMP12); | |
1222 | |
1223 vis_psub16(TMP14, TMP12, DST_0); | |
1224 vis_st64(DST_0, dest[0]); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1225 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1226 |
/*
 * MC_avg_y_16_vis - VIS motion-compensation kernel; two output rows per
 * loop iteration, each row written as two stores (dest[0] and dest + 8).
 *
 * A prologue loads reference row 0.  Every iteration loads the next two
 * reference rows (three words each: +0, +8, +16) and the two existing
 * destination rows, then stores two result rows, so dest is both read and
 * written.  For an even height this reads height + 1 reference rows.
 * NOTE(review): from the name this is presumably the vertical half-pel
 * interpolation averaged with the destination; the vis_* macros are not
 * visible here -- confirm.
 *
 * dest   - destination rows, read and then overwritten.
 * ref    - reference rows; replaced by the result of vis_alignaddr().
 * stride - row pitch applied to both ref and dest.
 * height - row count; halved (height >>= 1) and tested with
 *          do { } while (--height), so an odd trailing row is not
 *          processed and a value below 2 does not terminate normally.
 */
11458 | 1227 static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * ref,
2979 | 1228 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1229 { |
2979 | 1230 int stride_8 = stride + 8; |
1231 int stride_16 = stride + 16; | |
1232 | |
1233 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
1234 | |
1235 ref = vis_alignaddr(ref); | |
1236 | |
1237 vis_ld64(ref[ 0], TMP0); | |
1238 vis_fzero(ZERO); | |
1239 | |
1240 vis_ld64(ref[ 8], TMP2); | |
1241 | |
1242 vis_ld64(ref[16], TMP4); | |
1243 | |
1244 vis_ld64(constants3[0], CONST_3); | |
1245 vis_faligndata(TMP0, TMP2, REF_2); | |
1246 | |
1247 vis_ld64(constants256_512[0], CONST_256); | |
1248 vis_faligndata(TMP2, TMP4, REF_6); | |
1249 height >>= 1; | |
1250 | |
1251 do { /* 31 cycles */ | |
1252 vis_ld64_2(ref, stride, TMP0); | |
1253 vis_pmerge(ZERO, REF_2, TMP12); | |
1254 vis_mul8x16au(REF_2_1, CONST_256, TMP14); | |
1255 | |
1256 vis_ld64_2(ref, stride_8, TMP2); | |
1257 vis_pmerge(ZERO, REF_6, TMP16); | |
1258 vis_mul8x16au(REF_6_1, CONST_256, TMP18); | |
1259 | |
1260 vis_ld64_2(ref, stride_16, TMP4); | |
1261 ref += stride; | |
1262 | |
1263 vis_ld64(dest[0], DST_0); | |
1264 vis_faligndata(TMP0, TMP2, REF_0); | |
1265 | |
1266 vis_ld64_2(dest, 8, DST_2); | |
1267 vis_faligndata(TMP2, TMP4, REF_4); | |
1268 | |
1269 vis_ld64_2(ref, stride, TMP6); | |
1270 vis_pmerge(ZERO, REF_0, TMP0); | |
1271 vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
1272 | |
1273 vis_ld64_2(ref, stride_8, TMP8); | |
1274 vis_pmerge(ZERO, REF_4, TMP4); | |
1275 | |
1276 vis_ld64_2(ref, stride_16, TMP10); | |
1277 ref += stride; | |
1278 | |
/* The second destination row (dest + stride, dest + stride + 8) is loaded
 * into REF_S0 / REF_S2, which stand in for additional DST registers as the
 * original inline comments indicate. */
1279 vis_ld64_2(dest, stride, REF_S0/*DST_4*/); | |
1280 vis_faligndata(TMP6, TMP8, REF_2); | |
1281 vis_mul8x16au(REF_4_1, CONST_256, TMP6); | |
1282 | |
1283 vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/); | |
1284 vis_faligndata(TMP8, TMP10, REF_6); | |
1285 vis_mul8x16al(DST_0, CONST_512, TMP20); | |
1286 | |
1287 vis_padd16(TMP0, CONST_3, TMP0); | |
1288 vis_mul8x16al(DST_1, CONST_512, TMP22); | |
1289 | |
1290 vis_padd16(TMP2, CONST_3, TMP2); | |
1291 vis_mul8x16al(DST_2, CONST_512, TMP24); | |
1292 | |
1293 vis_padd16(TMP4, CONST_3, TMP4); | |
1294 vis_mul8x16al(DST_3, CONST_512, TMP26); | |
1295 | |
1296 vis_padd16(TMP6, CONST_3, TMP6); | |
1297 | |
1298 vis_padd16(TMP12, TMP20, TMP12); | |
1299 vis_mul8x16al(REF_S0, CONST_512, TMP20); | |
1300 | |
1301 vis_padd16(TMP14, TMP22, TMP14); | |
1302 vis_mul8x16al(REF_S0_1, CONST_512, TMP22); | |
1303 | |
1304 vis_padd16(TMP16, TMP24, TMP16); | |
1305 vis_mul8x16al(REF_S2, CONST_512, TMP24); | |
1306 | |
1307 vis_padd16(TMP18, TMP26, TMP18); | |
1308 vis_mul8x16al(REF_S2_1, CONST_512, TMP26); | |
1309 | |
1310 vis_padd16(TMP12, TMP0, TMP12); | |
1311 vis_mul8x16au(REF_2, CONST_256, TMP28); | |
1312 | |
1313 vis_padd16(TMP14, TMP2, TMP14); | |
1314 vis_mul8x16au(REF_2_1, CONST_256, TMP30); | |
1315 | |
/* REF_S4 and REF_S6 are used here purely as scratch destinations for the
 * multiply results; they are consumed by the padd16 calls further down. */
1316 vis_padd16(TMP16, TMP4, TMP16); | |
1317 vis_mul8x16au(REF_6, CONST_256, REF_S4); | |
1318 | |
1319 vis_padd16(TMP18, TMP6, TMP18); | |
1320 vis_mul8x16au(REF_6_1, CONST_256, REF_S6); | |
1321 | |
1322 vis_pack16(TMP12, DST_0); | |
1323 vis_padd16(TMP28, TMP0, TMP12); | |
1324 | |
1325 vis_pack16(TMP14, DST_1); | |
1326 vis_st64(DST_0, dest[0]); | |
1327 vis_padd16(TMP30, TMP2, TMP14); | |
1328 | |
1329 vis_pack16(TMP16, DST_2); | |
1330 vis_padd16(REF_S4, TMP4, TMP16); | |
1331 | |
1332 vis_pack16(TMP18, DST_3); | |
1333 vis_st64_2(DST_2, dest, 8); | |
1334 dest += stride; | |
1335 vis_padd16(REF_S6, TMP6, TMP18); | |
1336 | |
1337 vis_padd16(TMP12, TMP20, TMP12); | |
1338 | |
1339 vis_padd16(TMP14, TMP22, TMP14); | |
1340 vis_pack16(TMP12, DST_0); | |
1341 | |
1342 vis_padd16(TMP16, TMP24, TMP16); | |
1343 vis_pack16(TMP14, DST_1); | |
1344 vis_st64(DST_0, dest[0]); | |
1345 | |
1346 vis_padd16(TMP18, TMP26, TMP18); | |
1347 vis_pack16(TMP16, DST_2); | |
1348 | |
1349 vis_pack16(TMP18, DST_3); | |
1350 vis_st64_2(DST_2, dest, 8); | |
1351 dest += stride; | |
1352 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1353 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1354 |
/*
 * MC_avg_y_8_vis - VIS motion-compensation kernel; two output rows per loop
 * iteration, one vis_st64 to dest[0] per row.
 *
 * A prologue loads reference row 0.  Every iteration loads the next two
 * reference rows (two words each: +0 and +8) and the two existing
 * destination rows (dest[0] and dest + stride), then stores two result
 * rows, so dest is both read and written.  For an even height this reads
 * height + 1 reference rows.
 * NOTE(review): from the name this is presumably the 8-wide vertical
 * half-pel interpolation averaged with the destination; the vis_* macros
 * are not visible here -- confirm.
 *
 * dest   - destination rows, read and then overwritten.
 * ref    - reference rows; replaced by the result of vis_alignaddr().
 * stride - row pitch applied to both ref and dest.
 * height - row count; halved (height >>= 1) and tested with
 *          do { } while (--height), so an odd trailing row is not
 *          processed and a value below 2 does not terminate normally.
 */
11458 | 1355 static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * ref,
2979 | 1356 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1357 { |
2979 | 1358 int stride_8 = stride + 8; |
1359 | |
1360 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
1361 | |
1362 ref = vis_alignaddr(ref); | |
1363 | |
1364 vis_ld64(ref[ 0], TMP0); | |
1365 vis_fzero(ZERO); | |
1366 | |
1367 vis_ld64(ref[ 8], TMP2); | |
1368 | |
1369 vis_ld64(constants3[0], CONST_3); | |
1370 vis_faligndata(TMP0, TMP2, REF_2); | |
1371 | |
1372 vis_ld64(constants256_512[0], CONST_256); | |
1373 | |
1374 height >>= 1; | |
1375 do { /* 20 cycles */ | |
1376 vis_ld64_2(ref, stride, TMP0); | |
1377 vis_pmerge(ZERO, REF_2, TMP8); | |
1378 vis_mul8x16au(REF_2_1, CONST_256, TMP10); | |
1379 | |
1380 vis_ld64_2(ref, stride_8, TMP2); | |
1381 ref += stride; | |
1382 | |
1383 vis_ld64(dest[0], DST_0); | |
1384 | |
1385 vis_ld64_2(dest, stride, DST_2); | |
1386 vis_faligndata(TMP0, TMP2, REF_0); | |
1387 | |
1388 vis_ld64_2(ref, stride, TMP4); | |
1389 vis_mul8x16al(DST_0, CONST_512, TMP16); | |
1390 vis_pmerge(ZERO, REF_0, TMP12); | |
1391 | |
1392 vis_ld64_2(ref, stride_8, TMP6); | |
1393 ref += stride; | |
1394 vis_mul8x16al(DST_1, CONST_512, TMP18); | |
1395 vis_pmerge(ZERO, REF_0_1, TMP14); | |
1396 | |
1397 vis_padd16(TMP12, CONST_3, TMP12); | |
1398 vis_mul8x16al(DST_2, CONST_512, TMP24); | |
1399 | |
1400 vis_padd16(TMP14, CONST_3, TMP14); | |
1401 vis_mul8x16al(DST_3, CONST_512, TMP26); | |
1402 | |
/* REF_2 is reloaded with the second new reference row; it is used for the
 * second output row below and carried into the next iteration. */
1403 vis_faligndata(TMP4, TMP6, REF_2); | |
1404 | |
1405 vis_padd16(TMP8, TMP12, TMP8); | |
1406 | |
1407 vis_padd16(TMP10, TMP14, TMP10); | |
1408 vis_mul8x16au(REF_2, CONST_256, TMP20); | |
1409 | |
1410 vis_padd16(TMP8, TMP16, TMP0); | |
1411 vis_mul8x16au(REF_2_1, CONST_256, TMP22); | |
1412 | |
1413 vis_padd16(TMP10, TMP18, TMP2); | |
1414 vis_pack16(TMP0, DST_0); | |
1415 | |
1416 vis_pack16(TMP2, DST_1); | |
1417 vis_st64(DST_0, dest[0]); | |
1418 dest += stride; | |
1419 vis_padd16(TMP12, TMP20, TMP12); | |
1420 | |
1421 vis_padd16(TMP14, TMP22, TMP14); | |
1422 | |
1423 vis_padd16(TMP12, TMP24, TMP0); | |
1424 | |
1425 vis_padd16(TMP14, TMP26, TMP2); | |
1426 vis_pack16(TMP0, DST_2); | |
1427 | |
1428 vis_pack16(TMP2, DST_3); | |
1429 vis_st64(DST_2, dest[0]); | |
1430 dest += stride; | |
1431 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1432 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1433 |
/*
 * MC_put_xy_16_vis - VIS motion-compensation kernel; two output rows per
 * loop iteration, each row written as two stores (dest[0] and dest + 8).
 * dest is never loaded.
 *
 * A prologue loads reference row 0 and builds two realigned views of it
 * (offset `off` in REF_S0/REF_S4, offset `off + 1` in REF_S2/REF_S6).
 * Every iteration loads the next two reference rows (three words each),
 * builds the same two views for both, and combines vertically adjacent
 * rows.  For an even height this reads height + 1 reference rows.
 * NOTE(review): from the name this is presumably the combined
 * horizontal + vertical half-pel interpolation (four-sample average with
 * CONST_2 as rounding term); the vis_* macros are not visible here --
 * confirm.
 *
 * dest   - destination rows, write only.
 * ref    - reference rows; low three address bits are saved in `off`
 *          before ref is replaced by the result of vis_alignaddr().
 * stride - row pitch applied to both ref and dest.
 * height - row count; halved (height >>= 1) and tested with
 *          do { } while (--height), so an odd trailing row is not
 *          processed and a value below 2 does not terminate normally.
 */
11458 | 1434 static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * ref,
2979 | 1435 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1436 { |
2979 | 1437 unsigned long off = (unsigned long) ref & 0x7; |
1438 unsigned long off_plus_1 = off + 1; | |
1439 int stride_8 = stride + 8; | |
1440 int stride_16 = stride + 16; | |
1441 | |
1442 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
1443 | |
1444 ref = vis_alignaddr(ref); | |
1445 | |
1446 vis_ld64(ref[ 0], TMP0); | |
1447 vis_fzero(ZERO); | |
1448 | |
1449 vis_ld64(ref[ 8], TMP2); | |
1450 | |
1451 vis_ld64(ref[16], TMP4); | |
1452 | |
1453 vis_ld64(constants2[0], CONST_2); | |
1454 vis_faligndata(TMP0, TMP2, REF_S0); | |
1455 | |
1456 vis_ld64(constants256_512[0], CONST_256); | |
1457 vis_faligndata(TMP2, TMP4, REF_S4); | |
1458 | |
/* NOTE(review): off == 7 is special-cased here and inside the loop,
 * presumably because off + 1 is not a valid 0..7 alignment offset; the
 * "+1" view is then taken from the following word via vis_src1 -- confirm. */
1459 if (off != 0x7) { | |
1460 vis_alignaddr_g0((void *)off_plus_1); | |
1461 vis_faligndata(TMP0, TMP2, REF_S2); | |
1462 vis_faligndata(TMP2, TMP4, REF_S6); | |
1463 } else { | |
1464 vis_src1(TMP2, REF_S2); | |
1465 vis_src1(TMP4, REF_S6); | |
1466 } | |
1467 | |
1468 height >>= 1; | |
1469 do { | |
1470 vis_ld64_2(ref, stride, TMP0); | |
1471 vis_mul8x16au(REF_S0, CONST_256, TMP12); | |
1472 vis_pmerge(ZERO, REF_S0_1, TMP14); | |
1473 | |
/* Switch the alignment offset back to `off`; the preceding if-block may
 * have left it at off + 1. */
1474 vis_alignaddr_g0((void *)off); | |
1475 | |
1476 vis_ld64_2(ref, stride_8, TMP2); | |
1477 vis_mul8x16au(REF_S2, CONST_256, TMP16); | |
1478 vis_pmerge(ZERO, REF_S2_1, TMP18); | |
1479 | |
1480 vis_ld64_2(ref, stride_16, TMP4); | |
1481 ref += stride; | |
1482 vis_mul8x16au(REF_S4, CONST_256, TMP20); | |
1483 vis_pmerge(ZERO, REF_S4_1, TMP22); | |
1484 | |
1485 vis_ld64_2(ref, stride, TMP6); | |
1486 vis_mul8x16au(REF_S6, CONST_256, TMP24); | |
1487 vis_pmerge(ZERO, REF_S6_1, TMP26); | |
1488 | |
1489 vis_ld64_2(ref, stride_8, TMP8); | |
1490 vis_faligndata(TMP0, TMP2, REF_0); | |
1491 | |
1492 vis_ld64_2(ref, stride_16, TMP10); | |
1493 ref += stride; | |
1494 vis_faligndata(TMP2, TMP4, REF_4); | |
1495 | |
1496 vis_faligndata(TMP6, TMP8, REF_S0); | |
1497 | |
1498 vis_faligndata(TMP8, TMP10, REF_S4); | |
1499 | |
1500 if (off != 0x7) { | |
1501 vis_alignaddr_g0((void *)off_plus_1); | |
1502 vis_faligndata(TMP0, TMP2, REF_2); | |
1503 vis_faligndata(TMP2, TMP4, REF_6); | |
1504 vis_faligndata(TMP6, TMP8, REF_S2); | |
1505 vis_faligndata(TMP8, TMP10, REF_S6); | |
1506 } else { | |
1507 vis_src1(TMP2, REF_2); | |
1508 vis_src1(TMP4, REF_6); | |
1509 vis_src1(TMP8, REF_S2); | |
1510 vis_src1(TMP10, REF_S6); | |
1511 } | |
1512 | |
1513 vis_mul8x16au(REF_0, CONST_256, TMP0); | |
1514 vis_pmerge(ZERO, REF_0_1, TMP2); | |
1515 | |
1516 vis_mul8x16au(REF_2, CONST_256, TMP4); | |
1517 vis_pmerge(ZERO, REF_2_1, TMP6); | |
1518 | |
1519 vis_padd16(TMP0, CONST_2, TMP8); | |
1520 vis_mul8x16au(REF_4, CONST_256, TMP0); | |
1521 | |
1522 vis_padd16(TMP2, CONST_2, TMP10); | |
1523 vis_mul8x16au(REF_4_1, CONST_256, TMP2); | |
1524 | |
1525 vis_padd16(TMP8, TMP4, TMP8); | |
1526 vis_mul8x16au(REF_6, CONST_256, TMP4); | |
1527 | |
1528 vis_padd16(TMP10, TMP6, TMP10); | |
1529 vis_mul8x16au(REF_6_1, CONST_256, TMP6); | |
1530 | |
1531 vis_padd16(TMP12, TMP8, TMP12); | |
1532 | |
1533 vis_padd16(TMP14, TMP10, TMP14); | |
1534 | |
1535 vis_padd16(TMP12, TMP16, TMP12); | |
1536 | |
1537 vis_padd16(TMP14, TMP18, TMP14); | |
1538 vis_pack16(TMP12, DST_0); | |
1539 | |
1540 vis_pack16(TMP14, DST_1); | |
1541 vis_st64(DST_0, dest[0]); | |
1542 vis_padd16(TMP0, CONST_2, TMP12); | |
1543 | |
1544 vis_mul8x16au(REF_S0, CONST_256, TMP0); | |
1545 vis_padd16(TMP2, CONST_2, TMP14); | |
1546 | |
1547 vis_mul8x16au(REF_S0_1, CONST_256, TMP2); | |
1548 vis_padd16(TMP12, TMP4, TMP12); | |
1549 | |
1550 vis_mul8x16au(REF_S2, CONST_256, TMP4); | |
1551 vis_padd16(TMP14, TMP6, TMP14); | |
1552 | |
1553 vis_mul8x16au(REF_S2_1, CONST_256, TMP6); | |
1554 vis_padd16(TMP20, TMP12, TMP20); | |
1555 | |
1556 vis_padd16(TMP22, TMP14, TMP22); | |
1557 | |
1558 vis_padd16(TMP20, TMP24, TMP20); | |
1559 | |
1560 vis_padd16(TMP22, TMP26, TMP22); | |
1561 vis_pack16(TMP20, DST_2); | |
1562 | |
1563 vis_pack16(TMP22, DST_3); | |
1564 vis_st64_2(DST_2, dest, 8); | |
1565 dest += stride; | |
1566 vis_padd16(TMP0, TMP4, TMP24); | |
1567 | |
1568 vis_mul8x16au(REF_S4, CONST_256, TMP0); | |
1569 vis_padd16(TMP2, TMP6, TMP26); | |
1570 | |
1571 vis_mul8x16au(REF_S4_1, CONST_256, TMP2); | |
1572 vis_padd16(TMP24, TMP8, TMP24); | |
1573 | |
1574 vis_padd16(TMP26, TMP10, TMP26); | |
1575 vis_pack16(TMP24, DST_0); | |
1576 | |
1577 vis_pack16(TMP26, DST_1); | |
1578 vis_st64(DST_0, dest[0]); | |
1579 vis_pmerge(ZERO, REF_S6, TMP4); | |
1580 | |
1581 vis_pmerge(ZERO, REF_S6_1, TMP6); | |
1582 | |
1583 vis_padd16(TMP0, TMP4, TMP0); | |
1584 | |
1585 vis_padd16(TMP2, TMP6, TMP2); | |
1586 | |
1587 vis_padd16(TMP0, TMP12, TMP0); | |
1588 | |
1589 vis_padd16(TMP2, TMP14, TMP2); | |
1590 vis_pack16(TMP0, DST_2); | |
1591 | |
1592 vis_pack16(TMP2, DST_3); | |
1593 vis_st64_2(DST_2, dest, 8); | |
1594 dest += stride; | |
1595 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1596 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1597 |
/*
 * MC_put_xy_8_vis - VIS motion-compensation kernel; two output rows per
 * loop iteration, one vis_st64 to dest[0] per row.  dest is never loaded.
 *
 * A prologue loads reference row 0 and builds two realigned views of it
 * (offset `off` in REF_S0, offset `off + 1` in REF_S2).  Every iteration
 * loads the next two reference rows (two words each: +0 and +8), builds
 * the same two views for both (REF_S4/REF_S6 for the first new row,
 * REF_S0/REF_S2 for the second, carried into the next iteration) and
 * combines vertically adjacent rows.  For an even height this reads
 * height + 1 reference rows.
 * NOTE(review): from the name this is presumably the 8-wide combined
 * horizontal + vertical half-pel interpolation with CONST_2 as rounding
 * term; the vis_* macros are not visible here -- confirm.
 *
 * dest   - destination rows, write only.
 * ref    - reference rows; low three address bits are saved in `off`
 *          before ref is replaced by the result of vis_alignaddr().
 * stride - row pitch applied to both ref and dest.
 * height - row count; halved (height >>= 1) and tested with
 *          do { } while (--height), so an odd trailing row is not
 *          processed and a value below 2 does not terminate normally.
 */
11458 | 1598 static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * ref,
2979 | 1599 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1600 { |
2979 | 1601 unsigned long off = (unsigned long) ref & 0x7; |
1602 unsigned long off_plus_1 = off + 1; | |
1603 int stride_8 = stride + 8; | |
1604 | |
1605 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
1606 | |
1607 ref = vis_alignaddr(ref); | |
1608 | |
1609 vis_ld64(ref[ 0], TMP0); | |
1610 vis_fzero(ZERO); | |
1611 | |
1612 vis_ld64(ref[ 8], TMP2); | |
1613 | |
1614 vis_ld64(constants2[0], CONST_2); | |
1615 | |
1616 vis_ld64(constants256_512[0], CONST_256); | |
1617 vis_faligndata(TMP0, TMP2, REF_S0); | |
1618 | |
/* NOTE(review): off == 7 is special-cased here and inside the loop,
 * presumably because off + 1 is not a valid 0..7 alignment offset; the
 * "+1" view is then taken from the following word via vis_src1 -- confirm. */
1619 if (off != 0x7) { | |
1620 vis_alignaddr_g0((void *)off_plus_1); | |
1621 vis_faligndata(TMP0, TMP2, REF_S2); | |
1622 } else { | |
1623 vis_src1(TMP2, REF_S2); | |
1624 } | |
1625 | |
1626 height >>= 1; | |
1627 do { /* 26 cycles */ | |
1628 vis_ld64_2(ref, stride, TMP0); | |
1629 vis_mul8x16au(REF_S0, CONST_256, TMP8); | |
1630 vis_pmerge(ZERO, REF_S2, TMP12); | |
1631 | |
/* Switch the alignment offset back to `off`; the preceding if-block may
 * have left it at off + 1. */
1632 vis_alignaddr_g0((void *)off); | |
1633 | |
1634 vis_ld64_2(ref, stride_8, TMP2); | |
1635 ref += stride; | |
1636 vis_mul8x16au(REF_S0_1, CONST_256, TMP10); | |
1637 vis_pmerge(ZERO, REF_S2_1, TMP14); | |
1638 | |
1639 vis_ld64_2(ref, stride, TMP4); | |
1640 | |
1641 vis_ld64_2(ref, stride_8, TMP6); | |
1642 ref += stride; | |
1643 vis_faligndata(TMP0, TMP2, REF_S4); | |
1644 | |
1645 vis_pmerge(ZERO, REF_S4, TMP18); | |
1646 | |
1647 vis_pmerge(ZERO, REF_S4_1, TMP20); | |
1648 | |
1649 vis_faligndata(TMP4, TMP6, REF_S0); | |
1650 | |
1651 if (off != 0x7) { | |
1652 vis_alignaddr_g0((void *)off_plus_1); | |
1653 vis_faligndata(TMP0, TMP2, REF_S6); | |
1654 vis_faligndata(TMP4, TMP6, REF_S2); | |
1655 } else { | |
1656 vis_src1(TMP2, REF_S6); | |
1657 vis_src1(TMP6, REF_S2); | |
1658 } | |
1659 | |
1660 vis_padd16(TMP18, CONST_2, TMP18); | |
1661 vis_mul8x16au(REF_S6, CONST_256, TMP22); | |
1662 | |
1663 vis_padd16(TMP20, CONST_2, TMP20); | |
1664 vis_mul8x16au(REF_S6_1, CONST_256, TMP24); | |
1665 | |
1666 vis_mul8x16au(REF_S0, CONST_256, TMP26); | |
1667 vis_pmerge(ZERO, REF_S0_1, TMP28); | |
1668 | |
1669 vis_mul8x16au(REF_S2, CONST_256, TMP30); | |
1670 vis_padd16(TMP18, TMP22, TMP18); | |
1671 | |
1672 vis_mul8x16au(REF_S2_1, CONST_256, TMP32); | |
1673 vis_padd16(TMP20, TMP24, TMP20); | |
1674 | |
1675 vis_padd16(TMP8, TMP18, TMP8); | |
1676 | |
1677 vis_padd16(TMP10, TMP20, TMP10); | |
1678 | |
1679 vis_padd16(TMP8, TMP12, TMP8); | |
1680 | |
1681 vis_padd16(TMP10, TMP14, TMP10); | |
1682 vis_pack16(TMP8, DST_0); | |
1683 | |
1684 vis_pack16(TMP10, DST_1); | |
1685 vis_st64(DST_0, dest[0]); | |
1686 dest += stride; | |
1687 vis_padd16(TMP18, TMP26, TMP18); | |
1688 | |
1689 vis_padd16(TMP20, TMP28, TMP20); | |
1690 | |
1691 vis_padd16(TMP18, TMP30, TMP18); | |
1692 | |
1693 vis_padd16(TMP20, TMP32, TMP20); | |
1694 vis_pack16(TMP18, DST_2); | |
1695 | |
1696 vis_pack16(TMP20, DST_3); | |
1697 vis_st64(DST_2, dest[0]); | |
1698 dest += stride; | |
1699 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1700 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1701 |
/*
 * MC_avg_xy_16_vis: 16-byte-wide motion compensation, "avg" + "xy" variant
 * (going by the name: averages into dest with half-pel interpolation in
 * both directions -- NOTE(review): confirm against the table that installs it).
 *
 * dest   - destination rows; each row is loaded (DST_0/DST_2) and then
 *          stored back at dest[0] and dest + 8.
 * ref    - reference rows; may be unaligned. The low three address bits are
 *          saved in 'off' and ref is replaced by vis_alignaddr(ref).
 * stride - byte distance between consecutive rows of both dest and ref.
 * height - row count. It is halved before the loop, each pass writes two
 *          rows, and the do/while tests after decrementing; so it must be
 *          even and >= 2. A value below 2 starts the counter at 0, which
 *          the do/while then decrements past zero.
 *
 * Three doublewords (ref, ref + 8, ref + 16) are loaded per reference row
 * and combined by vis_faligndata into the 16 bytes at offset 'off' and the
 * 16 bytes at offset 'off + 1'.
 */
11458 | 1702 static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * ref,
2979 | 1703 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1704 { |
2979 | 1705 unsigned long off = (unsigned long) ref & 0x7; |
1706 unsigned long off_plus_1 = off + 1; | |
1707 int stride_8 = stride + 8; | |
1708 int stride_16 = stride + 16; | |
1709 | |
1710 vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT); | |
1711 | |
1712 ref = vis_alignaddr(ref); | |
1713 | |
1714 vis_ld64(ref[ 0], TMP0); | |
1715 vis_fzero(ZERO); | |
1716 | |
1717 vis_ld64(ref[ 8], TMP2); | |
1718 | |
1719 vis_ld64(ref[16], TMP4); | |
1720 | |
1721 vis_ld64(constants6[0], CONST_6); | |
1722 vis_faligndata(TMP0, TMP2, REF_S0); | |
1723 | |
1724 vis_ld64(constants256_1024[0], CONST_256); | |
1725 vis_faligndata(TMP2, TMP4, REF_S4); | |
1726 | |
/* With off == 7 the data shifted by one more byte starts exactly at the
 * next loaded doubleword, so it is taken with vis_src1 (presumably a plain
 * register copy) instead of being realigned with offset off + 1 == 8. */
1727 if (off != 0x7) { | |
1728 vis_alignaddr_g0((void *)off_plus_1); | |
1729 vis_faligndata(TMP0, TMP2, REF_S2); | |
1730 vis_faligndata(TMP2, TMP4, REF_S6); | |
1731 } else { | |
1732 vis_src1(TMP2, REF_S2); | |
1733 vis_src1(TMP4, REF_S6); | |
1734 } | |
1735 | |
/* Two output rows are produced per pass. */
1736 height >>= 1; | |
1737 do { /* 55 cycles */ | |
1738 vis_ld64_2(ref, stride, TMP0); | |
1739 vis_mul8x16au(REF_S0, CONST_256, TMP12); | |
1740 vis_pmerge(ZERO, REF_S0_1, TMP14); | |
1741 | |
/* Back to offset 'off'; the previous pass (or the prologue) left the
 * alignment at off + 1 when off != 7. */
1742 vis_alignaddr_g0((void *)off); | |
1743 | |
1744 vis_ld64_2(ref, stride_8, TMP2); | |
1745 vis_mul8x16au(REF_S2, CONST_256, TMP16); | |
1746 vis_pmerge(ZERO, REF_S2_1, TMP18); | |
1747 | |
1748 vis_ld64_2(ref, stride_16, TMP4); | |
1749 ref += stride; | |
1750 vis_mul8x16au(REF_S4, CONST_256, TMP20); | |
1751 vis_pmerge(ZERO, REF_S4_1, TMP22); | |
1752 | |
1753 vis_ld64_2(ref, stride, TMP6); | |
1754 vis_mul8x16au(REF_S6, CONST_256, TMP24); | |
1755 vis_pmerge(ZERO, REF_S6_1, TMP26); | |
1756 | |
1757 vis_ld64_2(ref, stride_8, TMP8); | |
1758 vis_faligndata(TMP0, TMP2, REF_0); | |
1759 | |
1760 vis_ld64_2(ref, stride_16, TMP10); | |
1761 ref += stride; | |
1762 vis_faligndata(TMP2, TMP4, REF_4); | |
1763 | |
1764 vis_ld64(dest[0], DST_0); | |
1765 vis_faligndata(TMP6, TMP8, REF_S0); | |
1766 | |
1767 vis_ld64_2(dest, 8, DST_2); | |
1768 vis_faligndata(TMP8, TMP10, REF_S4); | |
1769 | |
1770 if (off != 0x7) { | |
1771 vis_alignaddr_g0((void *)off_plus_1); | |
1772 vis_faligndata(TMP0, TMP2, REF_2); | |
1773 vis_faligndata(TMP2, TMP4, REF_6); | |
1774 vis_faligndata(TMP6, TMP8, REF_S2); | |
1775 vis_faligndata(TMP8, TMP10, REF_S6); | |
1776 } else { | |
1777 vis_src1(TMP2, REF_2); | |
1778 vis_src1(TMP4, REF_6); | |
1779 vis_src1(TMP8, REF_S2); | |
1780 vis_src1(TMP10, REF_S6); | |
1781 } | |
1782 | |
1783 vis_mul8x16al(DST_0, CONST_1024, TMP30); | |
1784 vis_pmerge(ZERO, REF_0, TMP0); | |
1785 | |
1786 vis_mul8x16al(DST_1, CONST_1024, TMP32); | |
1787 vis_pmerge(ZERO, REF_0_1, TMP2); | |
1788 | |
1789 vis_mul8x16au(REF_2, CONST_256, TMP4); | |
1790 vis_pmerge(ZERO, REF_2_1, TMP6); | |
1791 | |
1792 vis_mul8x16al(DST_2, CONST_1024, REF_0); | |
1793 vis_padd16(TMP0, CONST_6, TMP0); | |
1794 | |
1795 vis_mul8x16al(DST_3, CONST_1024, REF_2); | |
1796 vis_padd16(TMP2, CONST_6, TMP2); | |
1797 | |
1798 vis_padd16(TMP0, TMP4, TMP0); | |
1799 vis_mul8x16au(REF_4, CONST_256, TMP4); | |
1800 | |
1801 vis_padd16(TMP2, TMP6, TMP2); | |
1802 vis_mul8x16au(REF_4_1, CONST_256, TMP6); | |
1803 | |
1804 vis_padd16(TMP12, TMP0, TMP12); | |
1805 vis_mul8x16au(REF_6, CONST_256, TMP8); | |
1806 | |
1807 vis_padd16(TMP14, TMP2, TMP14); | |
1808 vis_mul8x16au(REF_6_1, CONST_256, TMP10); | |
1809 | |
1810 vis_padd16(TMP12, TMP16, TMP12); | |
1811 vis_mul8x16au(REF_S0, CONST_256, REF_4); | |
1812 | |
1813 vis_padd16(TMP14, TMP18, TMP14); | |
1814 vis_mul8x16au(REF_S0_1, CONST_256, REF_6); | |
1815 | |
1816 vis_padd16(TMP12, TMP30, TMP12); | |
1817 | |
1818 vis_padd16(TMP14, TMP32, TMP14); | |
1819 vis_pack16(TMP12, DST_0); | |
1820 | |
1821 vis_pack16(TMP14, DST_1); | |
/* First row of the pair, left 8 bytes. */
1822 vis_st64(DST_0, dest[0]); | |
1823 vis_padd16(TMP4, CONST_6, TMP4); | |
1824 | |
1825 vis_ld64_2(dest, stride, DST_0); | |
1826 vis_padd16(TMP6, CONST_6, TMP6); | |
1827 vis_mul8x16au(REF_S2, CONST_256, TMP12); | |
1828 | |
1829 vis_padd16(TMP4, TMP8, TMP4); | |
1830 vis_mul8x16au(REF_S2_1, CONST_256, TMP14); | |
1831 | |
1832 vis_padd16(TMP6, TMP10, TMP6); | |
1833 | |
1834 vis_padd16(TMP20, TMP4, TMP20); | |
1835 | |
1836 vis_padd16(TMP22, TMP6, TMP22); | |
1837 | |
1838 vis_padd16(TMP20, TMP24, TMP20); | |
1839 | |
1840 vis_padd16(TMP22, TMP26, TMP22); | |
1841 | |
1842 vis_padd16(TMP20, REF_0, TMP20); | |
1843 vis_mul8x16au(REF_S4, CONST_256, REF_0); | |
1844 | |
1845 vis_padd16(TMP22, REF_2, TMP22); | |
1846 vis_pack16(TMP20, DST_2); | |
1847 | |
1848 vis_pack16(TMP22, DST_3); | |
/* First row of the pair, right 8 bytes. */
1849 vis_st64_2(DST_2, dest, 8); | |
1850 dest += stride; | |
1851 | |
1852 vis_ld64_2(dest, 8, DST_2); | |
1853 vis_mul8x16al(DST_0, CONST_1024, TMP30); | |
1854 vis_pmerge(ZERO, REF_S4_1, REF_2); | |
1855 | |
1856 vis_mul8x16al(DST_1, CONST_1024, TMP32); | |
1857 vis_padd16(REF_4, TMP0, TMP8); | |
1858 | |
1859 vis_mul8x16au(REF_S6, CONST_256, REF_4); | |
1860 vis_padd16(REF_6, TMP2, TMP10); | |
1861 | |
1862 vis_mul8x16au(REF_S6_1, CONST_256, REF_6); | |
1863 vis_padd16(TMP8, TMP12, TMP8); | |
1864 | |
1865 vis_padd16(TMP10, TMP14, TMP10); | |
1866 | |
1867 vis_padd16(TMP8, TMP30, TMP8); | |
1868 | |
1869 vis_padd16(TMP10, TMP32, TMP10); | |
1870 vis_pack16(TMP8, DST_0); | |
1871 | |
1872 vis_pack16(TMP10, DST_1); | |
/* Second row of the pair, left 8 bytes. */
1873 vis_st64(DST_0, dest[0]); | |
1874 | |
1875 vis_padd16(REF_0, TMP4, REF_0); | |
1876 | |
1877 vis_mul8x16al(DST_2, CONST_1024, TMP30); | |
1878 vis_padd16(REF_2, TMP6, REF_2); | |
1879 | |
1880 vis_mul8x16al(DST_3, CONST_1024, TMP32); | |
1881 vis_padd16(REF_0, REF_4, REF_0); | |
1882 | |
1883 vis_padd16(REF_2, REF_6, REF_2); | |
1884 | |
1885 vis_padd16(REF_0, TMP30, REF_0); | |
1886 | |
1887 /* stall */ | |
1888 | |
1889 vis_padd16(REF_2, TMP32, REF_2); | |
1890 vis_pack16(REF_0, DST_2); | |
1891 | |
1892 vis_pack16(REF_2, DST_3); | |
/* Second row of the pair, right 8 bytes. */
1893 vis_st64_2(DST_2, dest, 8); | |
1894 dest += stride; | |
1895 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1896 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1897 |
/*
 * MC_avg_xy_8_vis: 8-byte-wide counterpart of the 16-wide "avg xy" routine
 * (going by the name: averages into dest with half-pel interpolation in
 * both directions -- NOTE(review): confirm against the table that installs it).
 *
 * dest   - destination rows; two rows are loaded (DST_0, DST_2) and two
 *          8-byte rows are stored per loop pass.
 * ref    - reference rows; may be unaligned. The low three address bits are
 *          saved in 'off' and ref is replaced by vis_alignaddr(ref).
 * stride - byte distance between consecutive rows of both dest and ref.
 * height - row count. It is halved before the loop and the do/while tests
 *          after decrementing, so it must be even and >= 2. A value below 2
 *          starts the counter at 0, which is then decremented past zero.
 *
 * Two doublewords (ref, ref + 8) are loaded per reference row and combined
 * by vis_faligndata into the 8 bytes at offset 'off' and at 'off + 1'.
 */
11458 | 1898 static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * ref,
2979 | 1899 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1900 { |
2979 | 1901 unsigned long off = (unsigned long) ref & 0x7; |
1902 unsigned long off_plus_1 = off + 1; | |
1903 int stride_8 = stride + 8; | |
1904 | |
1905 vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT); | |
1906 | |
1907 ref = vis_alignaddr(ref); | |
1908 | |
1909 vis_ld64(ref[0], TMP0); | |
1910 vis_fzero(ZERO); | |
1911 | |
1912 vis_ld64_2(ref, 8, TMP2); | |
1913 | |
1914 vis_ld64(constants6[0], CONST_6); | |
1915 | |
1916 vis_ld64(constants256_1024[0], CONST_256); | |
1917 vis_faligndata(TMP0, TMP2, REF_S0); | |
1918 | |
/* With off == 7 the data shifted by one more byte starts exactly at the
 * next loaded doubleword, so it is taken with vis_src1 (presumably a plain
 * register copy) instead of being realigned with offset off + 1 == 8. */
1919 if (off != 0x7) { | |
1920 vis_alignaddr_g0((void *)off_plus_1); | |
1921 vis_faligndata(TMP0, TMP2, REF_S2); | |
1922 } else { | |
1923 vis_src1(TMP2, REF_S2); | |
1924 } | |
1925 | |
/* Two output rows are produced per pass. */
1926 height >>= 1; | |
1927 do { /* 31 cycles */ | |
1928 vis_ld64_2(ref, stride, TMP0); | |
1929 vis_mul8x16au(REF_S0, CONST_256, TMP8); | |
1930 vis_pmerge(ZERO, REF_S0_1, TMP10); | |
1931 | |
1932 vis_ld64_2(ref, stride_8, TMP2); | |
1933 ref += stride; | |
1934 vis_mul8x16au(REF_S2, CONST_256, TMP12); | |
1935 vis_pmerge(ZERO, REF_S2_1, TMP14); | |
1936 | |
/* Back to offset 'off'; the previous pass (or the prologue) left the
 * alignment at off + 1 when off != 7. */
1937 vis_alignaddr_g0((void *)off); | |
1938 | |
1939 vis_ld64_2(ref, stride, TMP4); | |
1940 vis_faligndata(TMP0, TMP2, REF_S4); | |
1941 | |
1942 vis_ld64_2(ref, stride_8, TMP6); | |
1943 ref += stride; | |
1944 | |
1945 vis_ld64(dest[0], DST_0); | |
1946 vis_faligndata(TMP4, TMP6, REF_S0); | |
1947 | |
1948 vis_ld64_2(dest, stride, DST_2); | |
1949 | |
1950 if (off != 0x7) { | |
1951 vis_alignaddr_g0((void *)off_plus_1); | |
1952 vis_faligndata(TMP0, TMP2, REF_S6); | |
1953 vis_faligndata(TMP4, TMP6, REF_S2); | |
1954 } else { | |
1955 vis_src1(TMP2, REF_S6); | |
1956 vis_src1(TMP6, REF_S2); | |
1957 } | |
1958 | |
1959 vis_mul8x16al(DST_0, CONST_1024, TMP30); | |
1960 vis_pmerge(ZERO, REF_S4, TMP22); | |
1961 | |
1962 vis_mul8x16al(DST_1, CONST_1024, TMP32); | |
1963 vis_pmerge(ZERO, REF_S4_1, TMP24); | |
1964 | |
1965 vis_mul8x16au(REF_S6, CONST_256, TMP26); | |
1966 vis_pmerge(ZERO, REF_S6_1, TMP28); | |
1967 | |
1968 vis_mul8x16au(REF_S0, CONST_256, REF_S4); | |
1969 vis_padd16(TMP22, CONST_6, TMP22); | |
1970 | |
1971 vis_mul8x16au(REF_S0_1, CONST_256, REF_S6); | |
1972 vis_padd16(TMP24, CONST_6, TMP24); | |
1973 | |
1974 vis_mul8x16al(DST_2, CONST_1024, REF_0); | |
1975 vis_padd16(TMP22, TMP26, TMP22); | |
1976 | |
1977 vis_mul8x16al(DST_3, CONST_1024, REF_2); | |
1978 vis_padd16(TMP24, TMP28, TMP24); | |
1979 | |
1980 vis_mul8x16au(REF_S2, CONST_256, TMP26); | |
1981 vis_padd16(TMP8, TMP22, TMP8); | |
1982 | |
1983 vis_mul8x16au(REF_S2_1, CONST_256, TMP28); | |
1984 vis_padd16(TMP10, TMP24, TMP10); | |
1985 | |
1986 vis_padd16(TMP8, TMP12, TMP8); | |
1987 | |
1988 vis_padd16(TMP10, TMP14, TMP10); | |
1989 | |
1990 vis_padd16(TMP8, TMP30, TMP8); | |
1991 | |
1992 vis_padd16(TMP10, TMP32, TMP10); | |
1993 vis_pack16(TMP8, DST_0); | |
1994 | |
1995 vis_pack16(TMP10, DST_1); | |
/* First row of the pair. */
1996 vis_st64(DST_0, dest[0]); | |
1997 dest += stride; | |
1998 | |
1999 vis_padd16(REF_S4, TMP22, TMP12); | |
2000 | |
2001 vis_padd16(REF_S6, TMP24, TMP14); | |
2002 | |
2003 vis_padd16(TMP12, TMP26, TMP12); | |
2004 | |
2005 vis_padd16(TMP14, TMP28, TMP14); | |
2006 | |
2007 vis_padd16(TMP12, REF_0, TMP12); | |
2008 | |
2009 vis_padd16(TMP14, REF_2, TMP14); | |
2010 vis_pack16(TMP12, DST_2); | |
2011 | |
2012 vis_pack16(TMP14, DST_3); | |
/* Second row of the pair. */
2013 vis_st64(DST_2, dest[0]); | |
2014 dest += stride; | |
2015 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2016 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2017 |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2018 /* End of rounding code */ |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2019 |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2020 /* Start of no rounding code */ |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2021 /* The trick used in some of this file is the formula from the MMX |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2022 * motion comp code, which is: |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2023 * |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2024 * (x+y)>>1 == (x&y)+((x^y)>>1) |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2025 * |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2026 * This allows us to average 8 bytes at a time in a 64-bit FPU reg. |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2027 * We avoid overflows by masking before we do the shift, and we |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2028 * implement the shift by multiplying by 1/2 using mul8x16. So in |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2029 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2030 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2031 * the value 0x80808080 is in f8): |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2032 * |
2979 | 2033 * fxor f0, f2, f10 |
2034 * fand f10, f4, f10 | |
2035 * fmul8x16 f8, f10, f10 | |
2036 * fand f10, f6, f10 | |
2037 * fand f0, f2, f12 | |
2038 * fpadd16 f12, f10, f10 | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2039 */ |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2040 |
/*
 * MC_put_no_round_o_16_vis: copy 'height' rows of 16 bytes from ref to dest.
 *
 * No arithmetic is done here: every row is loaded as three doublewords
 * (ref, ref + 8, ref + 16), realigned with vis_faligndata and stored at
 * dest[0] and dest + 8.
 *
 * dest   - destination rows; only written.
 * ref    - source rows; may be unaligned, it is replaced by
 *          vis_alignaddr(ref) before the loop.
 * stride - byte distance between consecutive rows of both dest and ref.
 * height - row count; must be >= 1 because the do/while body runs before
 *          the counter is tested.
 */
11458 | 2041 static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * ref,
2979 | 2042 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2043 { |
2979 | 2044 ref = vis_alignaddr(ref); |
2045 do { /* 5 cycles */ | |
2046 vis_ld64(ref[0], TMP0); | |
2047 | |
2048 vis_ld64_2(ref, 8, TMP2); | |
2049 | |
2050 vis_ld64_2(ref, 16, TMP4); | |
2051 ref += stride; | |
2052 | |
/* Left 8 bytes of the row. */
2053 vis_faligndata(TMP0, TMP2, REF_0); | |
2054 vis_st64(REF_0, dest[0]); | |
2055 | |
/* Right 8 bytes of the row. */
2056 vis_faligndata(TMP2, TMP4, REF_2); | |
2057 vis_st64_2(REF_2, dest, 8); | |
2058 dest += stride; | |
2059 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2060 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2061 |
/*
 * MC_put_no_round_o_8_vis: copy 'height' rows of 8 bytes from ref to dest.
 *
 * No arithmetic is done here: every row is loaded as two doublewords
 * (ref, ref + 8), realigned with vis_faligndata and stored at dest[0].
 *
 * dest   - destination rows; only written.
 * ref    - source rows; may be unaligned, it is replaced by
 *          vis_alignaddr(ref) before the loop.
 * stride - byte distance between consecutive rows of both dest and ref.
 * height - row count; must be >= 1 because the do/while body runs before
 *          the counter is tested.
 */
11458 | 2062 static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * ref,
2979 | 2063 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2064 { |
2979 | 2065 ref = vis_alignaddr(ref); |
2066 do { /* 4 cycles */ | |
2067 vis_ld64(ref[0], TMP0); | |
2068 | |
2069 vis_ld64(ref[8], TMP2); | |
2070 ref += stride; | |
2071 | |
2072 /* stall */ | |
2073 | |
2074 vis_faligndata(TMP0, TMP2, REF_0); | |
2075 vis_st64(REF_0, dest[0]); | |
2076 dest += stride; | |
2077 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2078 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2079 |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2080 |
/*
 * MC_avg_no_round_o_16_vis: combine 16-byte-wide rows of ref into dest.
 *
 * Each output doubleword is built from the matching dest and (realigned)
 * ref doublewords with the xor / and MASK_fe / mul8x16 by CONST_128 /
 * and MASK_7f / and / padd16 sequence, i.e. the
 * (x & y) + ((x ^ y) >> 1) form described in the "no rounding" comment
 * earlier in this file.
 *
 * dest   - destination rows; read and then overwritten.
 * ref    - reference rows; may be unaligned, it is replaced by
 *          vis_alignaddr(ref) before the first load.
 * stride - byte distance between consecutive rows of both dest and ref.
 * height - row count. The routine is software pipelined: a prologue loads
 *          the first row, the loop runs (height >> 1) - 1 times writing two
 *          rows per pass, and an epilogue writes the last two rows. Because
 *          the do/while tests after decrementing, height must be even and
 *          >= 4; with height == 2 the counter starts at 0 and is
 *          decremented past zero.
 */
11458 | 2081 static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * ref,
2979 | 2082 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2083 { |
2979 | 2084 int stride_8 = stride + 8; |
2085 | |
2086 ref = vis_alignaddr(ref); | |
2087 | |
/* Prologue: first reference row and first destination row. */
2088 vis_ld64(ref[0], TMP0); | |
2089 | |
2090 vis_ld64(ref[8], TMP2); | |
2091 | |
2092 vis_ld64(ref[16], TMP4); | |
2093 | |
2094 vis_ld64(dest[0], DST_0); | |
2095 | |
2096 vis_ld64(dest[8], DST_2); | |
2097 | |
2098 vis_ld64(constants_fe[0], MASK_fe); | |
2099 vis_faligndata(TMP0, TMP2, REF_0); | |
2100 | |
2101 vis_ld64(constants_7f[0], MASK_7f); | |
2102 vis_faligndata(TMP2, TMP4, REF_2); | |
2103 | |
2104 vis_ld64(constants128[0], CONST_128); | |
2105 | |
2106 ref += stride; | |
/* One pair of rows is left for the epilogue after the loop. */
2107 height = (height >> 1) - 1; | |
2108 | |
2109 do { /* 24 cycles */ | |
2110 vis_ld64(ref[0], TMP0); | |
2111 vis_xor(DST_0, REF_0, TMP6); | |
2112 | |
2113 vis_ld64_2(ref, 8, TMP2); | |
2114 vis_and(TMP6, MASK_fe, TMP6); | |
2115 | |
2116 vis_ld64_2(ref, 16, TMP4); | |
2117 ref += stride; | |
2118 vis_mul8x16(CONST_128, TMP6, TMP6); | |
2119 vis_xor(DST_2, REF_2, TMP8); | |
2120 | |
2121 vis_and(TMP8, MASK_fe, TMP8); | |
2122 | |
2123 vis_and(DST_0, REF_0, TMP10); | |
2124 vis_ld64_2(dest, stride, DST_0); | |
2125 vis_mul8x16(CONST_128, TMP8, TMP8); | |
2126 | |
2127 vis_and(DST_2, REF_2, TMP12); | |
2128 vis_ld64_2(dest, stride_8, DST_2); | |
2129 | |
2130 vis_ld64(ref[0], TMP14); | |
2131 vis_and(TMP6, MASK_7f, TMP6); | |
2132 | |
2133 vis_and(TMP8, MASK_7f, TMP8); | |
2134 | |
2135 vis_padd16(TMP10, TMP6, TMP6); | |
2136 vis_st64(TMP6, dest[0]); | |
2137 | |
2138 vis_padd16(TMP12, TMP8, TMP8); | |
2139 vis_st64_2(TMP8, dest, 8); | |
2140 | |
2141 dest += stride; | |
2142 vis_ld64_2(ref, 8, TMP16); | |
2143 vis_faligndata(TMP0, TMP2, REF_0); | |
2144 | |
2145 vis_ld64_2(ref, 16, TMP18); | |
2146 vis_faligndata(TMP2, TMP4, REF_2); | |
2147 ref += stride; | |
2148 | |
2149 vis_xor(DST_0, REF_0, TMP20); | |
2150 | |
2151 vis_and(TMP20, MASK_fe, TMP20); | |
2152 | |
2153 vis_xor(DST_2, REF_2, TMP22); | |
2154 vis_mul8x16(CONST_128, TMP20, TMP20); | |
2155 | |
2156 vis_and(TMP22, MASK_fe, TMP22); | |
2157 | |
2158 vis_and(DST_0, REF_0, TMP24); | |
2159 vis_mul8x16(CONST_128, TMP22, TMP22); | |
2160 | |
2161 vis_and(DST_2, REF_2, TMP26); | |
2162 | |
2163 vis_ld64_2(dest, stride, DST_0); | |
2164 vis_faligndata(TMP14, TMP16, REF_0); | |
2165 | |
2166 vis_ld64_2(dest, stride_8, DST_2); | |
2167 vis_faligndata(TMP16, TMP18, REF_2); | |
2168 | |
2169 vis_and(TMP20, MASK_7f, TMP20); | |
2170 | |
2171 vis_and(TMP22, MASK_7f, TMP22); | |
2172 | |
2173 vis_padd16(TMP24, TMP20, TMP20); | |
2174 vis_st64(TMP20, dest[0]); | |
2175 | |
2176 vis_padd16(TMP26, TMP22, TMP22); | |
2177 vis_st64_2(TMP22, dest, 8); | |
2178 dest += stride; | |
2179 } while (--height); | |
2180 | |
/* Epilogue: the last two rows; only one more reference row is loaded. */
2181 vis_ld64(ref[0], TMP0); | |
2182 vis_xor(DST_0, REF_0, TMP6); | |
2183 | |
2184 vis_ld64_2(ref, 8, TMP2); | |
2185 vis_and(TMP6, MASK_fe, TMP6); | |
2186 | |
2187 vis_ld64_2(ref, 16, TMP4); | |
2188 vis_mul8x16(CONST_128, TMP6, TMP6); | |
2189 vis_xor(DST_2, REF_2, TMP8); | |
2190 | |
2191 vis_and(TMP8, MASK_fe, TMP8); | |
2192 | |
2193 vis_and(DST_0, REF_0, TMP10); | |
2194 vis_ld64_2(dest, stride, DST_0); | |
2195 vis_mul8x16(CONST_128, TMP8, TMP8); | |
2196 | |
2197 vis_and(DST_2, REF_2, TMP12); | |
2198 vis_ld64_2(dest, stride_8, DST_2); | |
2199 | |
/* NOTE(review): TMP14 is not referenced again before the function returns,
 * so this load looks like a leftover from the loop body -- confirm. */
2200 vis_ld64(ref[0], TMP14); | |
2201 vis_and(TMP6, MASK_7f, TMP6); | |
2202 | |
2203 vis_and(TMP8, MASK_7f, TMP8); | |
2204 | |
2205 vis_padd16(TMP10, TMP6, TMP6); | |
2206 vis_st64(TMP6, dest[0]); | |
2207 | |
2208 vis_padd16(TMP12, TMP8, TMP8); | |
2209 vis_st64_2(TMP8, dest, 8); | |
2210 | |
2211 dest += stride; | |
2212 vis_faligndata(TMP0, TMP2, REF_0); | |
2213 | |
2214 vis_faligndata(TMP2, TMP4, REF_2); | |
2215 | |
2216 vis_xor(DST_0, REF_0, TMP20); | |
2217 | |
2218 vis_and(TMP20, MASK_fe, TMP20); | |
2219 | |
2220 vis_xor(DST_2, REF_2, TMP22); | |
2221 vis_mul8x16(CONST_128, TMP20, TMP20); | |
2222 | |
2223 vis_and(TMP22, MASK_fe, TMP22); | |
2224 | |
2225 vis_and(DST_0, REF_0, TMP24); | |
2226 vis_mul8x16(CONST_128, TMP22, TMP22); | |
2227 | |
2228 vis_and(DST_2, REF_2, TMP26); | |
2229 | |
2230 vis_and(TMP20, MASK_7f, TMP20); | |
2231 | |
2232 vis_and(TMP22, MASK_7f, TMP22); | |
2233 | |
2234 vis_padd16(TMP24, TMP20, TMP20); | |
2235 vis_st64(TMP20, dest[0]); | |
2236 | |
2237 vis_padd16(TMP26, TMP22, TMP22); | |
2238 vis_st64_2(TMP22, dest, 8); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2239 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2240 |
/*
 * MC_avg_no_round_o_8_vis: combine 8-byte-wide rows of ref into dest.
 *
 * Each output doubleword is built from the matching dest and (realigned)
 * ref doublewords with the xor / and MASK_fe / mul8x16 by CONST_128 /
 * and MASK_7f / and / padd16 sequence, i.e. the
 * (x & y) + ((x ^ y) >> 1) form described in the "no rounding" comment
 * earlier in this file.
 *
 * dest   - destination rows; read and then overwritten.
 * ref    - reference rows; may be unaligned, it is replaced by
 *          vis_alignaddr(ref) before the first load.
 * stride - byte distance between consecutive rows of both dest and ref.
 * height - row count. A prologue loads the first row, the loop runs
 *          (height >> 1) - 1 times writing two rows per pass, and an
 *          epilogue writes the last two rows. Because the do/while tests
 *          after decrementing, height must be even and >= 4; with
 *          height == 2 the counter starts at 0 and is decremented past zero.
 */
11458 | 2241 static void MC_avg_no_round_o_8_vis (uint8_t * dest, const uint8_t * ref,
2979 | 2242 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2243 { |
2979 | 2244 ref = vis_alignaddr(ref); |
2245 | |
/* Prologue: first reference row and first destination row. */
2246 vis_ld64(ref[0], TMP0); | |
2247 | |
2248 vis_ld64(ref[8], TMP2); | |
2249 | |
2250 vis_ld64(dest[0], DST_0); | |
2251 | |
2252 vis_ld64(constants_fe[0], MASK_fe); | |
2253 | |
2254 vis_ld64(constants_7f[0], MASK_7f); | |
2255 vis_faligndata(TMP0, TMP2, REF_0); | |
2256 | |
2257 vis_ld64(constants128[0], CONST_128); | |
2258 | |
2259 ref += stride; | |
/* One pair of rows is left for the epilogue after the loop. */
2260 height = (height >> 1) - 1; | |
2261 | |
2262 do { /* 12 cycles */ | |
2263 vis_ld64(ref[0], TMP0); | |
2264 vis_xor(DST_0, REF_0, TMP4); | |
2265 | |
2266 vis_ld64(ref[8], TMP2); | |
2267 vis_and(TMP4, MASK_fe, TMP4); | |
2268 | |
2269 vis_and(DST_0, REF_0, TMP6); | |
2270 vis_ld64_2(dest, stride, DST_0); | |
2271 ref += stride; | |
2272 vis_mul8x16(CONST_128, TMP4, TMP4); | |
2273 | |
2274 vis_ld64(ref[0], TMP12); | |
2275 vis_faligndata(TMP0, TMP2, REF_0); | |
2276 | |
2277 vis_ld64(ref[8], TMP2); | |
2278 vis_xor(DST_0, REF_0, TMP0); | |
2279 ref += stride; | |
2280 | |
2281 vis_and(TMP0, MASK_fe, TMP0); | |
2282 | |
2283 vis_and(TMP4, MASK_7f, TMP4); | |
2284 | |
2285 vis_padd16(TMP6, TMP4, TMP4); | |
2286 vis_st64(TMP4, dest[0]); | |
2287 dest += stride; | |
2288 vis_mul8x16(CONST_128, TMP0, TMP0); | |
2289 | |
2290 vis_and(DST_0, REF_0, TMP6); | |
2291 vis_ld64_2(dest, stride, DST_0); | |
2292 | |
2293 vis_faligndata(TMP12, TMP2, REF_0); | |
2294 | |
2295 vis_and(TMP0, MASK_7f, TMP0); | |
2296 | |
2297 vis_padd16(TMP6, TMP0, TMP4); | |
2298 vis_st64(TMP4, dest[0]); | |
2299 dest += stride; | |
2300 } while (--height); | |
2301 | |
/* Epilogue: the last two rows; only one more reference row is loaded. */
2302 vis_ld64(ref[0], TMP0); | |
2303 vis_xor(DST_0, REF_0, TMP4); | |
2304 | |
2305 vis_ld64(ref[8], TMP2); | |
2306 vis_and(TMP4, MASK_fe, TMP4); | |
2307 | |
2308 vis_and(DST_0, REF_0, TMP6); | |
2309 vis_ld64_2(dest, stride, DST_0); | |
2310 vis_mul8x16(CONST_128, TMP4, TMP4); | |
2311 | |
2312 vis_faligndata(TMP0, TMP2, REF_0); | |
2313 | |
2314 vis_xor(DST_0, REF_0, TMP0); | |
2315 | |
2316 vis_and(TMP0, MASK_fe, TMP0); | |
2317 | |
2318 vis_and(TMP4, MASK_7f, TMP4); | |
2319 | |
2320 vis_padd16(TMP6, TMP4, TMP4); | |
2321 vis_st64(TMP4, dest[0]); | |
2322 dest += stride; | |
2323 vis_mul8x16(CONST_128, TMP0, TMP0); | |
2324 | |
2325 vis_and(DST_0, REF_0, TMP6); | |
2326 | |
2327 vis_and(TMP0, MASK_7f, TMP0); | |
2328 | |
2329 vis_padd16(TMP6, TMP0, TMP4); | |
2330 vis_st64(TMP4, dest[0]); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2331 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2332 |
/*
 * MC_put_no_round_x_16_vis: write 16-byte-wide rows to dest, each built from
 * the reference bytes at offset 'off' and the same bytes one position to
 * the right (offset 'off + 1').
 *
 * The two realigned copies (REF_0/REF_4 and REF_2/REF_6) are combined with
 * the xor / and MASK_fe / mul8x16 by CONST_128 / and MASK_7f / and / padd16
 * sequence, i.e. the (x & y) + ((x ^ y) >> 1) form described in the
 * "no rounding" comment earlier in this file.
 *
 * dest   - destination rows; only written.
 * ref    - reference rows; may be unaligned. The low three address bits are
 *          saved in 'off' and ref is replaced by vis_alignaddr(ref).
 * stride - byte distance between consecutive rows of both dest and ref.
 * height - row count. A prologue prepares the first row, the loop runs
 *          (height >> 1) - 1 times writing two rows per pass, and an
 *          epilogue writes the last two rows. Because the do/while tests
 *          after decrementing, height must be even and >= 4; with
 *          height == 2 the counter starts at 0 and is decremented past zero.
 */
11458 | 2333 static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * ref,
2979 | 2334 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2335 { |
2979 | 2336 unsigned long off = (unsigned long) ref & 0x7; |
2337 unsigned long off_plus_1 = off + 1; | |
2338 | |
2339 ref = vis_alignaddr(ref); | |
2340 | |
2341 vis_ld64(ref[0], TMP0); | |
2342 | |
2343 vis_ld64_2(ref, 8, TMP2); | |
2344 | |
2345 vis_ld64_2(ref, 16, TMP4); | |
2346 | |
2347 vis_ld64(constants_fe[0], MASK_fe); | |
2348 | |
2349 vis_ld64(constants_7f[0], MASK_7f); | |
2350 vis_faligndata(TMP0, TMP2, REF_0); | |
2351 | |
2352 vis_ld64(constants128[0], CONST_128); | |
2353 vis_faligndata(TMP2, TMP4, REF_4); | |
2354 | |
/* With off == 7 the data shifted by one more byte starts exactly at the
 * next loaded doubleword, so it is taken with vis_src1 (presumably a plain
 * register copy) instead of being realigned with offset off + 1 == 8. */
2355 if (off != 0x7) { | |
2356 vis_alignaddr_g0((void *)off_plus_1); | |
2357 vis_faligndata(TMP0, TMP2, REF_2); | |
2358 vis_faligndata(TMP2, TMP4, REF_6); | |
2359 } else { | |
2360 vis_src1(TMP2, REF_2); | |
2361 vis_src1(TMP4, REF_6); | |
2362 } | |
2363 | |
2364 ref += stride; | |
/* One pair of rows is left for the epilogue after the loop. */
2365 height = (height >> 1) - 1; | |
2366 | |
2367 do { /* 34 cycles */ | |
2368 vis_ld64(ref[0], TMP0); | |
2369 vis_xor(REF_0, REF_2, TMP6); | |
2370 | |
2371 vis_ld64_2(ref, 8, TMP2); | |
2372 vis_xor(REF_4, REF_6, TMP8); | |
2373 | |
2374 vis_ld64_2(ref, 16, TMP4); | |
2375 vis_and(TMP6, MASK_fe, TMP6); | |
2376 ref += stride; | |
2377 | |
2378 vis_ld64(ref[0], TMP14); | |
2379 vis_mul8x16(CONST_128, TMP6, TMP6); | |
2380 vis_and(TMP8, MASK_fe, TMP8); | |
2381 | |
2382 vis_ld64_2(ref, 8, TMP16); | |
2383 vis_mul8x16(CONST_128, TMP8, TMP8); | |
2384 vis_and(REF_0, REF_2, TMP10); | |
2385 | |
2386 vis_ld64_2(ref, 16, TMP18); | |
2387 ref += stride; | |
2388 vis_and(REF_4, REF_6, TMP12); | |
2389 | |
/* Back to offset 'off' before realigning the newly loaded row. */
2390 vis_alignaddr_g0((void *)off); | |
2391 | |
2392 vis_faligndata(TMP0, TMP2, REF_0); | |
2393 | |
2394 vis_faligndata(TMP2, TMP4, REF_4); | |
2395 | |
2396 if (off != 0x7) { | |
2397 vis_alignaddr_g0((void *)off_plus_1); | |
2398 vis_faligndata(TMP0, TMP2, REF_2); | |
2399 vis_faligndata(TMP2, TMP4, REF_6); | |
2400 } else { | |
2401 vis_src1(TMP2, REF_2); | |
2402 vis_src1(TMP4, REF_6); | |
2403 } | |
2404 | |
2405 vis_and(TMP6, MASK_7f, TMP6); | |
2406 | |
2407 vis_and(TMP8, MASK_7f, TMP8); | |
2408 | |
2409 vis_padd16(TMP10, TMP6, TMP6); | |
2410 vis_st64(TMP6, dest[0]); | |
2411 | |
2412 vis_padd16(TMP12, TMP8, TMP8); | |
2413 vis_st64_2(TMP8, dest, 8); | |
2414 dest += stride; | |
2415 | |
2416 vis_xor(REF_0, REF_2, TMP6); | |
2417 | |
2418 vis_xor(REF_4, REF_6, TMP8); | |
2419 | |
2420 vis_and(TMP6, MASK_fe, TMP6); | |
2421 | |
2422 vis_mul8x16(CONST_128, TMP6, TMP6); | |
2423 vis_and(TMP8, MASK_fe, TMP8); | |
2424 | |
2425 vis_mul8x16(CONST_128, TMP8, TMP8); | |
2426 vis_and(REF_0, REF_2, TMP10); | |
2427 | |
2428 vis_and(REF_4, REF_6, TMP12); | |
2429 | |
2430 vis_alignaddr_g0((void *)off); | |
2431 | |
2432 vis_faligndata(TMP14, TMP16, REF_0); | |
2433 | |
2434 vis_faligndata(TMP16, TMP18, REF_4); | |
2435 | |
2436 if (off != 0x7) { | |
2437 vis_alignaddr_g0((void *)off_plus_1); | |
2438 vis_faligndata(TMP14, TMP16, REF_2); | |
2439 vis_faligndata(TMP16, TMP18, REF_6); | |
2440 } else { | |
2441 vis_src1(TMP16, REF_2); | |
2442 vis_src1(TMP18, REF_6); | |
2443 } | |
2444 | |
2445 vis_and(TMP6, MASK_7f, TMP6); | |
2446 | |
2447 vis_and(TMP8, MASK_7f, TMP8); | |
2448 | |
2449 vis_padd16(TMP10, TMP6, TMP6); | |
2450 vis_st64(TMP6, dest[0]); | |
2451 | |
2452 vis_padd16(TMP12, TMP8, TMP8); | |
2453 vis_st64_2(TMP8, dest, 8); | |
2454 dest += stride; | |
2455 } while (--height); | |
2456 | |
/* Epilogue: the last two rows; only one more reference row is loaded. */
2457 vis_ld64(ref[0], TMP0); | |
2458 vis_xor(REF_0, REF_2, TMP6); | |
2459 | |
2460 vis_ld64_2(ref, 8, TMP2); | |
2461 vis_xor(REF_4, REF_6, TMP8); | |
2462 | |
2463 vis_ld64_2(ref, 16, TMP4); | |
2464 vis_and(TMP6, MASK_fe, TMP6); | |
2465 | |
2466 vis_mul8x16(CONST_128, TMP6, TMP6); | |
2467 vis_and(TMP8, MASK_fe, TMP8); | |
2468 | |
2469 vis_mul8x16(CONST_128, TMP8, TMP8); | |
2470 vis_and(REF_0, REF_2, TMP10); | |
2471 | |
2472 vis_and(REF_4, REF_6, TMP12); | |
2473 | |
2474 vis_alignaddr_g0((void *)off); | |
2475 | |
2476 vis_faligndata(TMP0, TMP2, REF_0); | |
2477 | |
2478 vis_faligndata(TMP2, TMP4, REF_4); | |
2479 | |
2480 if (off != 0x7) { | |
2481 vis_alignaddr_g0((void *)off_plus_1); | |
2482 vis_faligndata(TMP0, TMP2, REF_2); | |
2483 vis_faligndata(TMP2, TMP4, REF_6); | |
2484 } else { | |
2485 vis_src1(TMP2, REF_2); | |
2486 vis_src1(TMP4, REF_6); | |
2487 } | |
2488 | |
2489 vis_and(TMP6, MASK_7f, TMP6); | |
2490 | |
2491 vis_and(TMP8, MASK_7f, TMP8); | |
2492 | |
2493 vis_padd16(TMP10, TMP6, TMP6); | |
2494 vis_st64(TMP6, dest[0]); | |
2495 | |
2496 vis_padd16(TMP12, TMP8, TMP8); | |
2497 vis_st64_2(TMP8, dest, 8); | |
2498 dest += stride; | |
2499 | |
2500 vis_xor(REF_0, REF_2, TMP6); | |
2501 | |
2502 vis_xor(REF_4, REF_6, TMP8); | |
2503 | |
2504 vis_and(TMP6, MASK_fe, TMP6); | |
2505 | |
2506 vis_mul8x16(CONST_128, TMP6, TMP6); | |
2507 vis_and(TMP8, MASK_fe, TMP8); | |
2508 | |
2509 vis_mul8x16(CONST_128, TMP8, TMP8); | |
2510 vis_and(REF_0, REF_2, TMP10); | |
2511 | |
2512 vis_and(REF_4, REF_6, TMP12); | |
2513 | |
2514 vis_and(TMP6, MASK_7f, TMP6); | |
2515 | |
2516 vis_and(TMP8, MASK_7f, TMP8); | |
2517 | |
2518 vis_padd16(TMP10, TMP6, TMP6); | |
2519 vis_st64(TMP6, dest[0]); | |
2520 | |
2521 vis_padd16(TMP12, TMP8, TMP8); | |
2522 vis_st64_2(TMP8, dest, 8); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2523 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2524 |
/*
 * MC_put_no_round_x_8_vis: write 8-byte-wide rows to dest, each built from
 * the reference bytes at offset 'off' (REF_0) and the same bytes one
 * position to the right (REF_2, offset 'off + 1').
 *
 * The two realigned copies are combined with the xor / and MASK_fe /
 * mul8x16 by CONST_128 / and MASK_7f / and / padd16 sequence, i.e. the
 * (x & y) + ((x ^ y) >> 1) form described in the "no rounding" comment
 * earlier in this file.
 *
 * dest   - destination rows; only written.
 * ref    - reference rows; may be unaligned. The low three address bits are
 *          saved in 'off' and ref is replaced by vis_alignaddr(ref).
 * stride - byte distance between consecutive rows of both dest and ref.
 * height - row count. A prologue prepares the first row, the loop runs
 *          (height >> 1) - 1 times writing two rows per pass, and an
 *          epilogue writes the last two rows. Because the do/while tests
 *          after decrementing, height must be even and >= 4; with
 *          height == 2 the counter starts at 0 and is decremented past zero.
 */
11458 | 2525 static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * ref,
2979 | 2526 const int stride, int height)
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2527 { |
2979 | 2528 unsigned long off = (unsigned long) ref & 0x7; |
2529 unsigned long off_plus_1 = off + 1; | |
2530 | |
2531 ref = vis_alignaddr(ref); | |
2532 | |
2533 vis_ld64(ref[0], TMP0); | |
2534 | |
2535 vis_ld64(ref[8], TMP2); | |
2536 | |
2537 vis_ld64(constants_fe[0], MASK_fe); | |
2538 | |
2539 vis_ld64(constants_7f[0], MASK_7f); | |
2540 | |
2541 vis_ld64(constants128[0], CONST_128); | |
2542 vis_faligndata(TMP0, TMP2, REF_0); | |
2543 | |
/* With off == 7 the data shifted by one more byte starts exactly at the
 * next loaded doubleword, so it is taken with vis_src1 (presumably a plain
 * register copy) instead of being realigned with offset off + 1 == 8. */
2544 if (off != 0x7) { | |
2545 vis_alignaddr_g0((void *)off_plus_1); | |
2546 vis_faligndata(TMP0, TMP2, REF_2); | |
2547 } else { | |
2548 vis_src1(TMP2, REF_2); | |
2549 } | |
2550 | |
2551 ref += stride; | |
/* One pair of rows is left for the epilogue after the loop. */
2552 height = (height >> 1) - 1; | |
2553 | |
2554 do { /* 20 cycles */ | |
2555 vis_ld64(ref[0], TMP0); | |
2556 vis_xor(REF_0, REF_2, TMP4); | |
2557 | |
2558 vis_ld64_2(ref, 8, TMP2); | |
2559 vis_and(TMP4, MASK_fe, TMP4); | |
2560 ref += stride; | |
2561 | |
2562 vis_ld64(ref[0], TMP8); | |
2563 vis_and(REF_0, REF_2, TMP6); | |
2564 vis_mul8x16(CONST_128, TMP4, TMP4); | |
2565 | |
/* Back to offset 'off' before realigning the newly loaded row. */
2566 vis_alignaddr_g0((void *)off); | |
2567 | |
2568 vis_ld64_2(ref, 8, TMP10); | |
2569 ref += stride; | |
2570 vis_faligndata(TMP0, TMP2, REF_0); | |
2571 | |
2572 if (off != 0x7) { | |
2573 vis_alignaddr_g0((void *)off_plus_1); | |
2574 vis_faligndata(TMP0, TMP2, REF_2); | |
2575 } else { | |
2576 vis_src1(TMP2, REF_2); | |
2577 } | |
2578 | |
2579 vis_and(TMP4, MASK_7f, TMP4); | |
2580 | |
2581 vis_padd16(TMP6, TMP4, DST_0); | |
2582 vis_st64(DST_0, dest[0]); | |
2583 dest += stride; | |
2584 | |
2585 vis_xor(REF_0, REF_2, TMP12); | |
2586 | |
2587 vis_and(TMP12, MASK_fe, TMP12); | |
2588 | |
2589 vis_and(REF_0, REF_2, TMP14); | |
2590 vis_mul8x16(CONST_128, TMP12, TMP12); | |
2591 | |
2592 vis_alignaddr_g0((void *)off); | |
2593 vis_faligndata(TMP8, TMP10, REF_0); | |
2594 if (off != 0x7) { | |
2595 vis_alignaddr_g0((void *)off_plus_1); | |
2596 vis_faligndata(TMP8, TMP10, REF_2); | |
2597 } else { | |
2598 vis_src1(TMP10, REF_2); | |
2599 } | |
2600 | |
2601 vis_and(TMP12, MASK_7f, TMP12); | |
2602 | |
2603 vis_padd16(TMP14, TMP12, DST_0); | |
2604 vis_st64(DST_0, dest[0]); | |
2605 dest += stride; | |
2606 } while (--height); | |
2607 | |
/* Epilogue: the last two rows; only one more reference row is loaded. */
2608 vis_ld64(ref[0], TMP0); | |
2609 vis_xor(REF_0, REF_2, TMP4); | |
2610 | |
2611 vis_ld64_2(ref, 8, TMP2); | |
2612 vis_and(TMP4, MASK_fe, TMP4); | |
2613 | |
2614 vis_and(REF_0, REF_2, TMP6); | |
2615 vis_mul8x16(CONST_128, TMP4, TMP4); | |
2616 | |
2617 vis_alignaddr_g0((void *)off); | |
2618 | |
2619 vis_faligndata(TMP0, TMP2, REF_0); | |
2620 | |
2621 if (off != 0x7) { | |
2622 vis_alignaddr_g0((void *)off_plus_1); | |
2623 vis_faligndata(TMP0, TMP2, REF_2); | |
2624 } else { | |
2625 vis_src1(TMP2, REF_2); | |
2626 } | |
2627 | |
2628 vis_and(TMP4, MASK_7f, TMP4); | |
2629 | |
2630 vis_padd16(TMP6, TMP4, DST_0); | |
2631 vis_st64(DST_0, dest[0]); | |
2632 dest += stride; | |
2633 | |
2634 vis_xor(REF_0, REF_2, TMP12); | |
2635 | |
2636 vis_and(TMP12, MASK_fe, TMP12); | |
2637 | |
2638 vis_and(REF_0, REF_2, TMP14); | |
2639 vis_mul8x16(CONST_128, TMP12, TMP12); | |
2640 | |
2641 vis_and(TMP12, MASK_7f, TMP12); | |
2642 | |
2643 vis_padd16(TMP14, TMP12, DST_0); | |
2644 vis_st64(DST_0, dest[0]); | |
/* NOTE(review): dest is a by-value parameter and is not used after this
 * point, so this final increment has no effect. */
2645 dest += stride; | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2646 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2647 |
/*
 * MC_avg_no_round_x_16_vis
 *
 * For each of `height` rows: loads three 8-byte words from ref (offsets 0, 8
 * and 16) and two from dest (offsets 0 and 8), builds REF_0/REF_4 with the
 * alignment offset `off` and REF_2/REF_6 with `off + 1`, adds them to the
 * scaled dest words plus CONST_3, and stores 16 bytes back to dest[0..15].
 * ref and dest both advance by `stride` after every row.
 *
 * NOTE(review): from the constants used (CONST_256 on the ref terms,
 * CONST_512 on the dest terms, CONST_3, GSR scale factor 5) this looks like
 * (ref[x] + ref[x+1] + 2*dest[x] + 3) >> 2 per byte -- confirm against the
 * vis_* macro definitions, which are outside this excerpt.
 *
 * height must be >= 1: the do/while body runs once before height is tested.
 */
11458 | 2648 static void MC_avg_no_round_x_16_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 2649 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2650 { |
/* low three address bits of ref, i.e. its byte offset inside an 8-byte word */
2979 | 2651 unsigned long off = (unsigned long) ref & 0x7; |
2652 unsigned long off_plus_1 = off + 1; | |
2653 | |
/* scale factor 5 goes into the GSR; presumably consumed by vis_pack16 below */
2654 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
2655 | |
2656 vis_ld64(constants3[0], CONST_3); | |
2657 vis_fzero(ZERO); | |
/* CONST_512 is never loaded on its own in this function; presumably it names
 * part of the register filled by this constants256_512 load -- TODO confirm */
2658 vis_ld64(constants256_512[0], CONST_256); | |
2659 | |
/* from here on ref is whatever vis_alignaddr returned (presumably ref rounded
 * down to an 8-byte boundary) */
2660 ref = vis_alignaddr(ref); | |
2661 do { /* 26 cycles */ | |
2662 vis_ld64(ref[0], TMP0); | |
2663 | |
2664 vis_ld64(ref[8], TMP2); | |
2665 | |
/* re-select offset `off`: the off_plus_1 branch below changes the alignment
 * on every iteration */
2666 vis_alignaddr_g0((void *)off); | |
2667 | |
2668 vis_ld64(ref[16], TMP4); | |
2669 | |
2670 vis_ld64(dest[0], DST_0); | |
2671 vis_faligndata(TMP0, TMP2, REF_0); | |
2672 | |
2673 vis_ld64(dest[8], DST_2); | |
2674 vis_faligndata(TMP2, TMP4, REF_4); | |
2675 | |
/* off == 7 would make off_plus_1 == 8, a whole word; in that case the
 * following loaded word is taken as-is instead of re-aligning */
2676 if (off != 0x7) { | |
2677 vis_alignaddr_g0((void *)off_plus_1); | |
2678 vis_faligndata(TMP0, TMP2, REF_2); | |
2679 vis_faligndata(TMP2, TMP4, REF_6); | |
2680 } else { | |
2681 vis_src1(TMP2, REF_2); | |
2682 vis_src1(TMP4, REF_6); | |
2683 } | |
2684 | |
/* first 8 output bytes: REF_0 + REF_2 + scaled DST_0/DST_1 + CONST_3 */
2685 vis_mul8x16au(REF_0, CONST_256, TMP0); | |
2686 | |
2687 vis_pmerge(ZERO, REF_2, TMP4); | |
2688 vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
2689 | |
2690 vis_pmerge(ZERO, REF_2_1, TMP6); | |
2691 | |
2692 vis_padd16(TMP0, TMP4, TMP0); | |
2693 | |
2694 vis_mul8x16al(DST_0, CONST_512, TMP4); | |
2695 vis_padd16(TMP2, TMP6, TMP2); | |
2696 | |
2697 vis_mul8x16al(DST_1, CONST_512, TMP6); | |
2698 | |
2699 vis_mul8x16au(REF_6, CONST_256, TMP12); | |
2700 | |
2701 vis_padd16(TMP0, TMP4, TMP0); | |
2702 vis_mul8x16au(REF_6_1, CONST_256, TMP14); | |
2703 | |
2704 vis_padd16(TMP2, TMP6, TMP2); | |
2705 vis_mul8x16au(REF_4, CONST_256, TMP16); | |
2706 | |
2707 vis_padd16(TMP0, CONST_3, TMP8); | |
2708 vis_mul8x16au(REF_4_1, CONST_256, TMP18); | |
2709 | |
2710 vis_padd16(TMP2, CONST_3, TMP10); | |
2711 vis_pack16(TMP8, DST_0); | |
2712 | |
2713 vis_pack16(TMP10, DST_1); | |
/* second 8 output bytes: REF_4 + REF_6 + scaled DST_2/DST_3 + CONST_3 */
2714 vis_padd16(TMP16, TMP12, TMP0); | |
2715 | |
2716 vis_st64(DST_0, dest[0]); | |
2717 vis_mul8x16al(DST_2, CONST_512, TMP4); | |
2718 vis_padd16(TMP18, TMP14, TMP2); | |
2719 | |
2720 vis_mul8x16al(DST_3, CONST_512, TMP6); | |
2721 vis_padd16(TMP0, CONST_3, TMP0); | |
2722 | |
2723 vis_padd16(TMP2, CONST_3, TMP2); | |
2724 | |
2725 vis_padd16(TMP0, TMP4, TMP0); | |
2726 | |
2727 vis_padd16(TMP2, TMP6, TMP2); | |
2728 vis_pack16(TMP0, DST_2); | |
2729 | |
2730 vis_pack16(TMP2, DST_3); | |
2731 vis_st64(DST_2, dest[8]); | |
2732 | |
2733 ref += stride; | |
2734 dest += stride; | |
2735 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2736 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2737 |
/*
 * MC_avg_no_round_x_8_vis
 *
 * Each loop iteration reads four ref rows (two 8-byte loads per row, ref
 * advanced by `stride` after each row), reads the four matching dest rows,
 * and writes four 8-byte dest rows. The iteration count is height >> 2.
 *
 * For every row the words aligned at `off` (REF_0, REF_4, REF_S0, REF_S4)
 * and at `off + 1` (REF_2, REF_6, REF_S2, REF_S6) are added to the scaled
 * dest word plus CONST_3 and then packed.
 *
 * NOTE(review): presumably (ref[x] + ref[x+1] + 2*dest[x] + 3) >> 2 per
 * byte -- confirm against the vis_* macro definitions outside this excerpt.
 *
 * height must be >= 4: the body runs before the loop test, and a count of
 * zero would be decremented past zero instead of ending the loop.
 */
11458 | 2738 static void MC_avg_no_round_x_8_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 2739 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2740 { |
/* low three address bits of ref, i.e. its byte offset inside an 8-byte word */
2979 | 2741 unsigned long off = (unsigned long) ref & 0x7; |
2742 unsigned long off_plus_1 = off + 1; | |
2743 int stride_times_2 = stride << 1; | |
2744 | |
2745 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
2746 | |
2747 vis_ld64(constants3[0], CONST_3); | |
2748 vis_fzero(ZERO); | |
2749 vis_ld64(constants256_512[0], CONST_256); | |
2750 | |
2751 ref = vis_alignaddr(ref); | |
/* four rows are produced per iteration */
2752 height >>= 2; | |
2753 do { /* 47 cycles */ | |
/* load rows 0..3 of ref; the `off`-aligned words are built as the loads
 * complete */
2754 vis_ld64(ref[0], TMP0); | |
2755 | |
2756 vis_ld64_2(ref, 8, TMP2); | |
2757 ref += stride; | |
2758 | |
2759 vis_alignaddr_g0((void *)off); | |
2760 | |
2761 vis_ld64(ref[0], TMP4); | |
2762 vis_faligndata(TMP0, TMP2, REF_0); | |
2763 | |
2764 vis_ld64_2(ref, 8, TMP6); | |
2765 ref += stride; | |
2766 | |
2767 vis_ld64(ref[0], TMP8); | |
2768 | |
2769 vis_ld64_2(ref, 8, TMP10); | |
2770 ref += stride; | |
2771 vis_faligndata(TMP4, TMP6, REF_4); | |
2772 | |
2773 vis_ld64(ref[0], TMP12); | |
2774 | |
2775 vis_ld64_2(ref, 8, TMP14); | |
2776 ref += stride; | |
2777 vis_faligndata(TMP8, TMP10, REF_S0); | |
2778 | |
2779 vis_faligndata(TMP12, TMP14, REF_S4); | |
2780 | |
/* build the `off + 1` words for all four rows; off == 7 would need an
 * offset of 8, so the second loaded word of each row is used directly.
 * Both branches also fetch dest rows 0 and 1 into DST_0 and DST_2. */
2781 if (off != 0x7) { | |
2782 vis_alignaddr_g0((void *)off_plus_1); | |
2783 | |
2784 vis_ld64(dest[0], DST_0); | |
2785 vis_faligndata(TMP0, TMP2, REF_2); | |
2786 | |
2787 vis_ld64_2(dest, stride, DST_2); | |
2788 vis_faligndata(TMP4, TMP6, REF_6); | |
2789 | |
2790 vis_faligndata(TMP8, TMP10, REF_S2); | |
2791 | |
2792 vis_faligndata(TMP12, TMP14, REF_S6); | |
2793 } else { | |
2794 vis_ld64(dest[0], DST_0); | |
2795 vis_src1(TMP2, REF_2); | |
2796 | |
2797 vis_ld64_2(dest, stride, DST_2); | |
2798 vis_src1(TMP6, REF_6); | |
2799 | |
2800 vis_src1(TMP10, REF_S2); | |
2801 | |
2802 vis_src1(TMP14, REF_S6); | |
2803 } | |
2804 | |
/* rows 0 and 1: TMP0/TMP2 accumulate row 0, TMP8/TMP10 accumulate row 1 */
2805 vis_pmerge(ZERO, REF_0, TMP0); | |
2806 vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
2807 | |
2808 vis_pmerge(ZERO, REF_2, TMP4); | |
2809 vis_mul8x16au(REF_2_1, CONST_256, TMP6); | |
2810 | |
2811 vis_padd16(TMP0, CONST_3, TMP0); | |
2812 vis_mul8x16al(DST_0, CONST_512, TMP16); | |
2813 | |
2814 vis_padd16(TMP2, CONST_3, TMP2); | |
2815 vis_mul8x16al(DST_1, CONST_512, TMP18); | |
2816 | |
2817 vis_padd16(TMP0, TMP4, TMP0); | |
2818 vis_mul8x16au(REF_4, CONST_256, TMP8); | |
2819 | |
2820 vis_padd16(TMP2, TMP6, TMP2); | |
2821 vis_mul8x16au(REF_4_1, CONST_256, TMP10); | |
2822 | |
2823 vis_padd16(TMP0, TMP16, TMP0); | |
2824 vis_mul8x16au(REF_6, CONST_256, TMP12); | |
2825 | |
2826 vis_padd16(TMP2, TMP18, TMP2); | |
2827 vis_mul8x16au(REF_6_1, CONST_256, TMP14); | |
2828 | |
2829 vis_padd16(TMP8, CONST_3, TMP8); | |
2830 vis_mul8x16al(DST_2, CONST_512, TMP16); | |
2831 | |
2832 vis_padd16(TMP8, TMP12, TMP8); | |
2833 vis_mul8x16al(DST_3, CONST_512, TMP18); | |
2834 | |
2835 vis_padd16(TMP10, TMP14, TMP10); | |
2836 vis_pack16(TMP0, DST_0); | |
2837 | |
2838 vis_pack16(TMP2, DST_1); | |
2839 vis_st64(DST_0, dest[0]); | |
2840 dest += stride; | |
2841 vis_padd16(TMP10, CONST_3, TMP10); | |
2842 | |
/* dest now points at row 1, so these two loads fetch dest rows 2 and 3.
 * Row 3 goes to TMP4 because DST_2 is about to be packed with the row 1
 * result and stored. */
2843 vis_ld64_2(dest, stride, DST_0); | |
2844 vis_padd16(TMP8, TMP16, TMP8); | |
2845 | |
2846 vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/); | |
2847 vis_padd16(TMP10, TMP18, TMP10); | |
2848 vis_pack16(TMP8, DST_2); | |
2849 | |
2850 vis_pack16(TMP10, DST_3); | |
2851 vis_st64(DST_2, dest[0]); | |
2852 dest += stride; | |
2853 | |
/* rows 2 and 3: same computation on REF_S0/REF_S2 and REF_S4/REF_S6 */
2854 vis_mul8x16au(REF_S0_1, CONST_256, TMP2); | |
2855 vis_pmerge(ZERO, REF_S0, TMP0); | |
2856 | |
2857 vis_pmerge(ZERO, REF_S2, TMP24); | |
2858 vis_mul8x16au(REF_S2_1, CONST_256, TMP6); | |
2859 | |
2860 vis_padd16(TMP0, CONST_3, TMP0); | |
2861 vis_mul8x16au(REF_S4, CONST_256, TMP8); | |
2862 | |
2863 vis_padd16(TMP2, CONST_3, TMP2); | |
2864 vis_mul8x16au(REF_S4_1, CONST_256, TMP10); | |
2865 | |
2866 vis_padd16(TMP0, TMP24, TMP0); | |
2867 vis_mul8x16au(REF_S6, CONST_256, TMP12); | |
2868 | |
2869 vis_padd16(TMP2, TMP6, TMP2); | |
2870 vis_mul8x16au(REF_S6_1, CONST_256, TMP14); | |
2871 | |
2872 vis_padd16(TMP8, CONST_3, TMP8); | |
2873 vis_mul8x16al(DST_0, CONST_512, TMP16); | |
2874 | |
2875 vis_padd16(TMP10, CONST_3, TMP10); | |
2876 vis_mul8x16al(DST_1, CONST_512, TMP18); | |
2877 | |
2878 vis_padd16(TMP8, TMP12, TMP8); | |
2879 vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20); | |
2880 | |
2881 vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22); | |
2882 vis_padd16(TMP0, TMP16, TMP0); | |
2883 | |
2884 vis_padd16(TMP2, TMP18, TMP2); | |
2885 vis_pack16(TMP0, DST_0); | |
2886 | |
2887 vis_padd16(TMP10, TMP14, TMP10); | |
2888 vis_pack16(TMP2, DST_1); | |
2889 vis_st64(DST_0, dest[0]); | |
2890 dest += stride; | |
2891 | |
2892 vis_padd16(TMP8, TMP20, TMP8); | |
2893 | |
2894 vis_padd16(TMP10, TMP22, TMP10); | |
2895 vis_pack16(TMP8, DST_2); | |
2896 | |
2897 vis_pack16(TMP10, DST_3); | |
2898 vis_st64(DST_2, dest[0]); | |
2899 dest += stride; | |
2900 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2901 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2902 |
/*
 * MC_put_no_round_y_16_vis
 *
 * Writes `height` rows of 16 bytes to dest, each computed from two
 * vertically adjacent ref rows (REF_0/REF_4 and REF_2/REF_6 alternate
 * between being the upper and the lower row).
 *
 * Structure: the prologue loads and aligns ref rows 0 and 1; every loop
 * iteration loads two more ref rows and stores two dest rows; the epilogue
 * after the loop loads one last ref row and stores the final two dest rows.
 * In total height + 1 ref rows are read, three 8-byte loads per row.
 *
 * NOTE(review): the xor / and-MASK_fe / mul8x16(CONST_128) / and-MASK_7f /
 * add-to-(a & b) sequence looks like the bytewise truncating average
 * (a & b) + (((a ^ b) & 0xfe) >> 1) -- confirm against the vis_* macros and
 * the constants tables, which are outside this excerpt.
 *
 * height must be >= 4: the loop count is (height >> 1) - 1 and the do/while
 * body runs before it is tested, so a height of 2 or 3 gives a count of zero
 * that is decremented past zero.
 */
11458 | 2903 static void MC_put_no_round_y_16_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 2904 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2905 { |
/* prologue: fetch ref rows 0 and 1 and align them into REF_0/REF_4 (row 0)
 * and REF_2/REF_6 (row 1) */
2979 | 2906 ref = vis_alignaddr(ref); |
2907 vis_ld64(ref[0], TMP0); | |
2908 | |
2909 vis_ld64_2(ref, 8, TMP2); | |
2910 | |
2911 vis_ld64_2(ref, 16, TMP4); | |
2912 ref += stride; | |
2913 | |
2914 vis_ld64(ref[0], TMP6); | |
2915 vis_faligndata(TMP0, TMP2, REF_0); | |
2916 | |
2917 vis_ld64_2(ref, 8, TMP8); | |
2918 vis_faligndata(TMP2, TMP4, REF_4); | |
2919 | |
2920 vis_ld64_2(ref, 16, TMP10); | |
2921 ref += stride; | |
2922 | |
2923 vis_ld64(constants_fe[0], MASK_fe); | |
2924 vis_faligndata(TMP6, TMP8, REF_2); | |
2925 | |
2926 vis_ld64(constants_7f[0], MASK_7f); | |
2927 vis_faligndata(TMP8, TMP10, REF_6); | |
2928 | |
2929 vis_ld64(constants128[0], CONST_128); | |
/* two rows per iteration; the last two rows are done after the loop */
2930 height = (height >> 1) - 1; | |
2931 do { /* 24 cycles */ | |
/* xor/and of the two rows currently held, overlapped with the loads of
 * the next two ref rows */
2932 vis_ld64(ref[0], TMP0); | |
2933 vis_xor(REF_0, REF_2, TMP12); | |
2934 | |
2935 vis_ld64_2(ref, 8, TMP2); | |
2936 vis_xor(REF_4, REF_6, TMP16); | |
2937 | |
2938 vis_ld64_2(ref, 16, TMP4); | |
2939 ref += stride; | |
2940 vis_and(REF_0, REF_2, TMP14); | |
2941 | |
2942 vis_ld64(ref[0], TMP6); | |
2943 vis_and(REF_4, REF_6, TMP18); | |
2944 | |
2945 vis_ld64_2(ref, 8, TMP8); | |
/* REF_0/REF_4 are overwritten with the newly loaded row; the old
 * contents were already consumed above */
2946 vis_faligndata(TMP0, TMP2, REF_0); | |
2947 | |
2948 vis_ld64_2(ref, 16, TMP10); | |
2949 ref += stride; | |
2950 vis_faligndata(TMP2, TMP4, REF_4); | |
2951 | |
2952 vis_and(TMP12, MASK_fe, TMP12); | |
2953 | |
2954 vis_and(TMP16, MASK_fe, TMP16); | |
2955 vis_mul8x16(CONST_128, TMP12, TMP12); | |
2956 | |
2957 vis_mul8x16(CONST_128, TMP16, TMP16); | |
/* second output row: new REF_0/REF_4 against the still-old REF_2/REF_6 */
2958 vis_xor(REF_0, REF_2, TMP0); | |
2959 | |
2960 vis_xor(REF_4, REF_6, TMP2); | |
2961 | |
2962 vis_and(REF_0, REF_2, TMP20); | |
2963 | |
2964 vis_and(TMP12, MASK_7f, TMP12); | |
2965 | |
2966 vis_and(TMP16, MASK_7f, TMP16); | |
2967 | |
2968 vis_padd16(TMP14, TMP12, TMP12); | |
2969 vis_st64(TMP12, dest[0]); | |
2970 | |
2971 vis_padd16(TMP18, TMP16, TMP16); | |
2972 vis_st64_2(TMP16, dest, 8); | |
2973 dest += stride; | |
2974 | |
2975 vis_and(REF_4, REF_6, TMP18); | |
2976 | |
2977 vis_and(TMP0, MASK_fe, TMP0); | |
2978 | |
2979 vis_and(TMP2, MASK_fe, TMP2); | |
2980 vis_mul8x16(CONST_128, TMP0, TMP0); | |
2981 | |
/* only now are REF_2/REF_6 replaced by the second newly loaded row, ready
 * for the next iteration (or the epilogue) */
2982 vis_faligndata(TMP6, TMP8, REF_2); | |
2983 vis_mul8x16(CONST_128, TMP2, TMP2); | |
2984 | |
2985 vis_faligndata(TMP8, TMP10, REF_6); | |
2986 | |
2987 vis_and(TMP0, MASK_7f, TMP0); | |
2988 | |
2989 vis_and(TMP2, MASK_7f, TMP2); | |
2990 | |
2991 vis_padd16(TMP20, TMP0, TMP0); | |
2992 vis_st64(TMP0, dest[0]); | |
2993 | |
2994 vis_padd16(TMP18, TMP2, TMP2); | |
2995 vis_st64_2(TMP2, dest, 8); | |
2996 dest += stride; | |
2997 } while (--height); | |
2998 | |
/* epilogue: same computation as one loop iteration, but only one more ref
 * row is loaded and REF_2/REF_6 are not refilled */
2999 vis_ld64(ref[0], TMP0); | |
3000 vis_xor(REF_0, REF_2, TMP12); | |
3001 | |
3002 vis_ld64_2(ref, 8, TMP2); | |
3003 vis_xor(REF_4, REF_6, TMP16); | |
3004 | |
3005 vis_ld64_2(ref, 16, TMP4); | |
3006 vis_and(REF_0, REF_2, TMP14); | |
3007 | |
3008 vis_and(REF_4, REF_6, TMP18); | |
3009 | |
3010 vis_faligndata(TMP0, TMP2, REF_0); | |
3011 | |
3012 vis_faligndata(TMP2, TMP4, REF_4); | |
3013 | |
3014 vis_and(TMP12, MASK_fe, TMP12); | |
3015 | |
3016 vis_and(TMP16, MASK_fe, TMP16); | |
3017 vis_mul8x16(CONST_128, TMP12, TMP12); | |
3018 | |
3019 vis_mul8x16(CONST_128, TMP16, TMP16); | |
3020 vis_xor(REF_0, REF_2, TMP0); | |
3021 | |
3022 vis_xor(REF_4, REF_6, TMP2); | |
3023 | |
3024 vis_and(REF_0, REF_2, TMP20); | |
3025 | |
3026 vis_and(TMP12, MASK_7f, TMP12); | |
3027 | |
3028 vis_and(TMP16, MASK_7f, TMP16); | |
3029 | |
3030 vis_padd16(TMP14, TMP12, TMP12); | |
3031 vis_st64(TMP12, dest[0]); | |
3032 | |
3033 vis_padd16(TMP18, TMP16, TMP16); | |
3034 vis_st64_2(TMP16, dest, 8); | |
3035 dest += stride; | |
3036 | |
3037 vis_and(REF_4, REF_6, TMP18); | |
3038 | |
3039 vis_and(TMP0, MASK_fe, TMP0); | |
3040 | |
3041 vis_and(TMP2, MASK_fe, TMP2); | |
3042 vis_mul8x16(CONST_128, TMP0, TMP0); | |
3043 | |
3044 vis_mul8x16(CONST_128, TMP2, TMP2); | |
3045 | |
3046 vis_and(TMP0, MASK_7f, TMP0); | |
3047 | |
3048 vis_and(TMP2, MASK_7f, TMP2); | |
3049 | |
3050 vis_padd16(TMP20, TMP0, TMP0); | |
3051 vis_st64(TMP0, dest[0]); | |
3052 | |
3053 vis_padd16(TMP18, TMP2, TMP2); | |
3054 vis_st64_2(TMP2, dest, 8); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3055 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3056 |
/*
 * MC_put_no_round_y_8_vis
 *
 * Writes `height` rows of 8 bytes to dest, each computed from two
 * vertically adjacent ref rows held in REF_0 and REF_2.
 *
 * Structure: the prologue loads and aligns ref rows 0 and 1; every loop
 * iteration loads two more ref rows and stores two dest rows; the epilogue
 * loads one last ref row and stores the final two dest rows. In total
 * height + 1 ref rows are read, two 8-byte loads per row.
 *
 * NOTE(review): the xor / and-MASK_fe / mul8x16(CONST_128) / and-MASK_7f /
 * add-to-(a & b) sequence looks like the bytewise truncating average
 * (a & b) + (((a ^ b) & 0xfe) >> 1) -- confirm against the vis_* macros and
 * the constants tables, which are outside this excerpt.
 *
 * height must be >= 4: the loop count is (height >> 1) - 1 and the do/while
 * body runs before it is tested.
 */
11458 | 3057 static void MC_put_no_round_y_8_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 3058 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3059 { |
/* prologue: ref row 0 ends up in REF_0, ref row 1 in REF_2 */
2979 | 3060 ref = vis_alignaddr(ref); |
3061 vis_ld64(ref[0], TMP0); | |
3062 | |
3063 vis_ld64_2(ref, 8, TMP2); | |
3064 ref += stride; | |
3065 | |
3066 vis_ld64(ref[0], TMP4); | |
3067 | |
3068 vis_ld64_2(ref, 8, TMP6); | |
3069 ref += stride; | |
3070 | |
3071 vis_ld64(constants_fe[0], MASK_fe); | |
3072 vis_faligndata(TMP0, TMP2, REF_0); | |
3073 | |
3074 vis_ld64(constants_7f[0], MASK_7f); | |
3075 vis_faligndata(TMP4, TMP6, REF_2); | |
3076 | |
3077 vis_ld64(constants128[0], CONST_128); | |
/* two rows per iteration; the last two rows are done after the loop */
3078 height = (height >> 1) - 1; | |
3079 do { /* 12 cycles */ | |
/* first output row from the REF_0/REF_2 pair currently held */
3080 vis_ld64(ref[0], TMP0); | |
3081 vis_xor(REF_0, REF_2, TMP4); | |
3082 | |
3083 vis_ld64_2(ref, 8, TMP2); | |
3084 ref += stride; | |
3085 vis_and(TMP4, MASK_fe, TMP4); | |
3086 | |
3087 vis_and(REF_0, REF_2, TMP6); | |
3088 vis_mul8x16(CONST_128, TMP4, TMP4); | |
3089 | |
/* REF_0 becomes the next ref row; TMP0/TMP2 are then reused for the row
 * after that */
3090 vis_faligndata(TMP0, TMP2, REF_0); | |
3091 vis_ld64(ref[0], TMP0); | |
3092 | |
3093 vis_ld64_2(ref, 8, TMP2); | |
3094 ref += stride; | |
/* second output row: new REF_0 against the still-old REF_2 */
3095 vis_xor(REF_0, REF_2, TMP12); | |
3096 | |
3097 vis_and(TMP4, MASK_7f, TMP4); | |
3098 | |
3099 vis_and(TMP12, MASK_fe, TMP12); | |
3100 | |
3101 vis_mul8x16(CONST_128, TMP12, TMP12); | |
3102 vis_and(REF_0, REF_2, TMP14); | |
3103 | |
3104 vis_padd16(TMP6, TMP4, DST_0); | |
3105 vis_st64(DST_0, dest[0]); | |
3106 dest += stride; | |
3107 | |
/* REF_2 is refilled only after its old value has been consumed above */
3108 vis_faligndata(TMP0, TMP2, REF_2); | |
3109 | |
3110 vis_and(TMP12, MASK_7f, TMP12); | |
3111 | |
3112 vis_padd16(TMP14, TMP12, DST_0); | |
3113 vis_st64(DST_0, dest[0]); | |
3114 dest += stride; | |
3115 } while (--height); | |
3116 | |
/* epilogue: one more ref row is loaded into REF_0; REF_2 is not refilled */
3117 vis_ld64(ref[0], TMP0); | |
3118 vis_xor(REF_0, REF_2, TMP4); | |
3119 | |
3120 vis_ld64_2(ref, 8, TMP2); | |
3121 vis_and(TMP4, MASK_fe, TMP4); | |
3122 | |
3123 vis_and(REF_0, REF_2, TMP6); | |
3124 vis_mul8x16(CONST_128, TMP4, TMP4); | |
3125 | |
3126 vis_faligndata(TMP0, TMP2, REF_0); | |
3127 | |
3128 vis_xor(REF_0, REF_2, TMP12); | |
3129 | |
3130 vis_and(TMP4, MASK_7f, TMP4); | |
3131 | |
3132 vis_and(TMP12, MASK_fe, TMP12); | |
3133 | |
3134 vis_mul8x16(CONST_128, TMP12, TMP12); | |
3135 vis_and(REF_0, REF_2, TMP14); | |
3136 | |
3137 vis_padd16(TMP6, TMP4, DST_0); | |
3138 vis_st64(DST_0, dest[0]); | |
3139 dest += stride; | |
3140 | |
3141 vis_and(TMP12, MASK_7f, TMP12); | |
3142 | |
3143 vis_padd16(TMP14, TMP12, DST_0); | |
3144 vis_st64(DST_0, dest[0]); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3145 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3146 |
/*
 * MC_avg_no_round_y_16_vis
 *
 * Produces two 16-byte dest rows per loop iteration (height >> 1
 * iterations). Before the loop, ref row 0 is loaded and aligned into
 * REF_2/REF_6. Each iteration loads the next two ref rows (at ref + stride,
 * three 8-byte loads each), reads two dest rows, and stores two dest rows.
 * On loop exit REF_2/REF_6 again hold the most recently loaded row.
 *
 * Each output accumulates the upper ref row, the lower ref row, the scaled
 * dest row and CONST_3 before being packed.
 *
 * NOTE(review): with CONST_256 on the ref terms, CONST_512 on the dest
 * terms, CONST_3 and GSR scale factor 5 this looks like
 * (ref[y] + ref[y+1] + 2*dest + 3) >> 2 per byte -- confirm against the
 * vis_* macro definitions outside this excerpt.
 *
 * height must be >= 2: the body runs before the loop count is tested.
 */
11458 | 3147 static void MC_avg_no_round_y_16_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 3148 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3149 { |
/* precomputed offsets of the second and third 8-byte word of the next row */
2979 | 3150 int stride_8 = stride + 8; |
3151 int stride_16 = stride + 16; | |
3152 | |
3153 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
3154 | |
3155 ref = vis_alignaddr(ref); | |
3156 | |
3157 vis_ld64(ref[ 0], TMP0); | |
3158 vis_fzero(ZERO); | |
3159 | |
3160 vis_ld64(ref[ 8], TMP2); | |
3161 | |
3162 vis_ld64(ref[16], TMP4); | |
3163 | |
3164 vis_ld64(constants3[0], CONST_3); | |
3165 vis_faligndata(TMP0, TMP2, REF_2); | |
3166 | |
3167 vis_ld64(constants256_512[0], CONST_256); | |
3168 vis_faligndata(TMP2, TMP4, REF_6); | |
3169 height >>= 1; | |
3170 | |
3171 do { /* 31 cycles */ | |
/* widen the row carried in REF_2/REF_6 (TMP12..TMP18) while the next ref
 * row is loaded */
3172 vis_ld64_2(ref, stride, TMP0); | |
3173 vis_pmerge(ZERO, REF_2, TMP12); | |
3174 vis_mul8x16au(REF_2_1, CONST_256, TMP14); | |
3175 | |
3176 vis_ld64_2(ref, stride_8, TMP2); | |
3177 vis_pmerge(ZERO, REF_6, TMP16); | |
3178 vis_mul8x16au(REF_6_1, CONST_256, TMP18); | |
3179 | |
3180 vis_ld64_2(ref, stride_16, TMP4); | |
3181 ref += stride; | |
3182 | |
3183 vis_ld64(dest[0], DST_0); | |
3184 vis_faligndata(TMP0, TMP2, REF_0); | |
3185 | |
3186 vis_ld64_2(dest, 8, DST_2); | |
3187 vis_faligndata(TMP2, TMP4, REF_4); | |
3188 | |
/* middle row (REF_0/REF_4) widened into TMP0..TMP6; it is shared by both
 * output rows of this iteration */
3189 vis_ld64_2(ref, stride, TMP6); | |
3190 vis_pmerge(ZERO, REF_0, TMP0); | |
3191 vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
3192 | |
3193 vis_ld64_2(ref, stride_8, TMP8); | |
3194 vis_pmerge(ZERO, REF_4, TMP4); | |
3195 | |
3196 vis_ld64_2(ref, stride_16, TMP10); | |
3197 ref += stride; | |
3198 | |
/* the second dest row is parked in REF_S0/REF_S2 because DST_0..DST_3 are
 * still needed for the first row */
3199 vis_ld64_2(dest, stride, REF_S0/*DST_4*/); | |
3200 vis_faligndata(TMP6, TMP8, REF_2); | |
3201 vis_mul8x16au(REF_4_1, CONST_256, TMP6); | |
3202 | |
3203 vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/); | |
3204 vis_faligndata(TMP8, TMP10, REF_6); | |
3205 vis_mul8x16al(DST_0, CONST_512, TMP20); | |
3206 | |
/* CONST_3 is added once to the shared middle row, so both output rows
 * receive it */
3207 vis_padd16(TMP0, CONST_3, TMP0); | |
3208 vis_mul8x16al(DST_1, CONST_512, TMP22); | |
3209 | |
3210 vis_padd16(TMP2, CONST_3, TMP2); | |
3211 vis_mul8x16al(DST_2, CONST_512, TMP24); | |
3212 | |
3213 vis_padd16(TMP4, CONST_3, TMP4); | |
3214 vis_mul8x16al(DST_3, CONST_512, TMP26); | |
3215 | |
3216 vis_padd16(TMP6, CONST_3, TMP6); | |
3217 | |
3218 vis_padd16(TMP12, TMP20, TMP12); | |
3219 vis_mul8x16al(REF_S0, CONST_512, TMP20); | |
3220 | |
3221 vis_padd16(TMP14, TMP22, TMP14); | |
3222 vis_mul8x16al(REF_S0_1, CONST_512, TMP22); | |
3223 | |
3224 vis_padd16(TMP16, TMP24, TMP16); | |
3225 vis_mul8x16al(REF_S2, CONST_512, TMP24); | |
3226 | |
3227 vis_padd16(TMP18, TMP26, TMP18); | |
3228 vis_mul8x16al(REF_S2_1, CONST_512, TMP26); | |
3229 | |
3230 vis_padd16(TMP12, TMP0, TMP12); | |
3231 vis_mul8x16au(REF_2, CONST_256, TMP28); | |
3232 | |
3233 vis_padd16(TMP14, TMP2, TMP14); | |
3234 vis_mul8x16au(REF_2_1, CONST_256, TMP30); | |
3235 | |
/* REF_S4/REF_S6 serve as plain temporaries for the widened new row */
3236 vis_padd16(TMP16, TMP4, TMP16); | |
3237 vis_mul8x16au(REF_6, CONST_256, REF_S4); | |
3238 | |
3239 vis_padd16(TMP18, TMP6, TMP18); | |
3240 vis_mul8x16au(REF_6_1, CONST_256, REF_S6); | |
3241 | |
3242 vis_pack16(TMP12, DST_0); | |
3243 vis_padd16(TMP28, TMP0, TMP12); | |
3244 | |
3245 vis_pack16(TMP14, DST_1); | |
3246 vis_st64(DST_0, dest[0]); | |
3247 vis_padd16(TMP30, TMP2, TMP14); | |
3248 | |
3249 vis_pack16(TMP16, DST_2); | |
3250 vis_padd16(REF_S4, TMP4, TMP16); | |
3251 | |
3252 vis_pack16(TMP18, DST_3); | |
3253 vis_st64_2(DST_2, dest, 8); | |
3254 dest += stride; | |
3255 vis_padd16(REF_S6, TMP6, TMP18); | |
3256 | |
/* second output row: add the scaled second dest row and store */
3257 vis_padd16(TMP12, TMP20, TMP12); | |
3258 | |
3259 vis_padd16(TMP14, TMP22, TMP14); | |
3260 vis_pack16(TMP12, DST_0); | |
3261 | |
3262 vis_padd16(TMP16, TMP24, TMP16); | |
3263 vis_pack16(TMP14, DST_1); | |
3264 vis_st64(DST_0, dest[0]); | |
3265 | |
3266 vis_padd16(TMP18, TMP26, TMP18); | |
3267 vis_pack16(TMP16, DST_2); | |
3268 | |
3269 vis_pack16(TMP18, DST_3); | |
3270 vis_st64_2(DST_2, dest, 8); | |
3271 dest += stride; | |
3272 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3273 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3274 |
/*
 * MC_avg_no_round_y_8_vis
 *
 * Produces two 8-byte dest rows per loop iteration (height >> 1
 * iterations). Before the loop, ref row 0 is loaded and aligned into REF_2.
 * Each iteration loads the next two ref rows (at ref + stride, two 8-byte
 * loads each), reads dest rows 0 and 1 into DST_0 and DST_2, and stores two
 * dest rows. REF_2 is refilled with the last loaded row for the next
 * iteration.
 *
 * NOTE(review): with CONST_256 on the ref terms, CONST_512 on the dest
 * terms, CONST_3 and GSR scale factor 5 this looks like
 * (ref[y] + ref[y+1] + 2*dest + 3) >> 2 per byte -- confirm against the
 * vis_* macro definitions outside this excerpt.
 *
 * height must be >= 2: the body runs before the loop count is tested.
 */
11458 | 3275 static void MC_avg_no_round_y_8_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 3276 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3277 { |
/* offset of the second 8-byte word of the next row */
2979 | 3278 int stride_8 = stride + 8; |
3279 | |
3280 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
3281 | |
3282 ref = vis_alignaddr(ref); | |
3283 | |
3284 vis_ld64(ref[ 0], TMP0); | |
3285 vis_fzero(ZERO); | |
3286 | |
3287 vis_ld64(ref[ 8], TMP2); | |
3288 | |
3289 vis_ld64(constants3[0], CONST_3); | |
3290 vis_faligndata(TMP0, TMP2, REF_2); | |
3291 | |
3292 vis_ld64(constants256_512[0], CONST_256); | |
3293 | |
3294 height >>= 1; | |
3295 do { /* 20 cycles */ | |
/* widen the row carried in REF_2 (TMP8/TMP10) while the next ref row is
 * loaded */
3296 vis_ld64_2(ref, stride, TMP0); | |
3297 vis_pmerge(ZERO, REF_2, TMP8); | |
3298 vis_mul8x16au(REF_2_1, CONST_256, TMP10); | |
3299 | |
3300 vis_ld64_2(ref, stride_8, TMP2); | |
3301 ref += stride; | |
3302 | |
3303 vis_ld64(dest[0], DST_0); | |
3304 | |
3305 vis_ld64_2(dest, stride, DST_2); | |
3306 vis_faligndata(TMP0, TMP2, REF_0); | |
3307 | |
/* middle row (REF_0) widened into TMP12/TMP14; shared by both outputs */
3308 vis_ld64_2(ref, stride, TMP4); | |
3309 vis_mul8x16al(DST_0, CONST_512, TMP16); | |
3310 vis_pmerge(ZERO, REF_0, TMP12); | |
3311 | |
3312 vis_ld64_2(ref, stride_8, TMP6); | |
3313 ref += stride; | |
3314 vis_mul8x16al(DST_1, CONST_512, TMP18); | |
3315 vis_pmerge(ZERO, REF_0_1, TMP14); | |
3316 | |
/* CONST_3 is added once to the shared middle row */
3317 vis_padd16(TMP12, CONST_3, TMP12); | |
3318 vis_mul8x16al(DST_2, CONST_512, TMP24); | |
3319 | |
3320 vis_padd16(TMP14, CONST_3, TMP14); | |
3321 vis_mul8x16al(DST_3, CONST_512, TMP26); | |
3322 | |
/* REF_2 now holds the newest row: used for the second output below and
 * carried into the next iteration */
3323 vis_faligndata(TMP4, TMP6, REF_2); | |
3324 | |
3325 vis_padd16(TMP8, TMP12, TMP8); | |
3326 | |
3327 vis_padd16(TMP10, TMP14, TMP10); | |
3328 vis_mul8x16au(REF_2, CONST_256, TMP20); | |
3329 | |
3330 vis_padd16(TMP8, TMP16, TMP0); | |
3331 vis_mul8x16au(REF_2_1, CONST_256, TMP22); | |
3332 | |
3333 vis_padd16(TMP10, TMP18, TMP2); | |
3334 vis_pack16(TMP0, DST_0); | |
3335 | |
3336 vis_pack16(TMP2, DST_1); | |
3337 vis_st64(DST_0, dest[0]); | |
3338 dest += stride; | |
3339 vis_padd16(TMP12, TMP20, TMP12); | |
3340 | |
3341 vis_padd16(TMP14, TMP22, TMP14); | |
3342 | |
3343 vis_padd16(TMP12, TMP24, TMP0); | |
3344 | |
3345 vis_padd16(TMP14, TMP26, TMP2); | |
3346 vis_pack16(TMP0, DST_2); | |
3347 | |
3348 vis_pack16(TMP2, DST_3); | |
3349 vis_st64(DST_2, dest[0]); | |
3350 dest += stride; | |
3351 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3352 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3353 |
/*
 * MC_put_no_round_xy_16_vis
 *
 * Produces two 16-byte dest rows per loop iteration (height >> 1
 * iterations). Every output combines four inputs: the words aligned at
 * `off` and at `off + 1` from two vertically adjacent ref rows, plus the
 * rounding term CONST_1.
 *
 * Register roles: REF_S0/REF_S4 (offset off) and REF_S2/REF_S6 (offset
 * off + 1) carry the row left over from the prologue or the previous
 * iteration; REF_0/REF_4 and REF_2/REF_6 hold the first row loaded in the
 * current iteration; the second loaded row then replaces REF_S*.
 *
 * NOTE(review): with GSR scale factor 5 and CONST_1 this looks like
 * (a + b + c + d + 1) >> 2 per byte -- confirm against the vis_* macro
 * definitions outside this excerpt.
 *
 * height must be >= 2: the body runs before the loop count is tested.
 */
11458 | 3354 static void MC_put_no_round_xy_16_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 3355 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3356 { |
/* low three address bits of ref, i.e. its byte offset inside an 8-byte word */
2979 | 3357 unsigned long off = (unsigned long) ref & 0x7; |
3358 unsigned long off_plus_1 = off + 1; | |
3359 int stride_8 = stride + 8; | |
3360 int stride_16 = stride + 16; | |
3361 | |
3362 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
3363 | |
3364 ref = vis_alignaddr(ref); | |
3365 | |
3366 vis_ld64(ref[ 0], TMP0); | |
3367 vis_fzero(ZERO); | |
3368 | |
3369 vis_ld64(ref[ 8], TMP2); | |
3370 | |
3371 vis_ld64(ref[16], TMP4); | |
3372 | |
/* the only rounding constant this function loads */
3373 vis_ld64(constants1[0], CONST_1); | |
3374 vis_faligndata(TMP0, TMP2, REF_S0); | |
3375 | |
3376 vis_ld64(constants256_512[0], CONST_256); | |
3377 vis_faligndata(TMP2, TMP4, REF_S4); | |
3378 | |
/* off == 7 would make off_plus_1 == 8, a whole word; in that case the
 * following loaded word is taken as-is instead of re-aligning */
3379 if (off != 0x7) { | |
3380 vis_alignaddr_g0((void *)off_plus_1); | |
3381 vis_faligndata(TMP0, TMP2, REF_S2); | |
3382 vis_faligndata(TMP2, TMP4, REF_S6); | |
3383 } else { | |
3384 vis_src1(TMP2, REF_S2); | |
3385 vis_src1(TMP4, REF_S6); | |
3386 } | |
3387 | |
3388 height >>= 1; | |
3389 do { | |
/* widen the carried row (REF_S*) into TMP12..TMP26 while the next two ref
 * rows are loaded */
3390 vis_ld64_2(ref, stride, TMP0); | |
3391 vis_mul8x16au(REF_S0, CONST_256, TMP12); | |
3392 vis_pmerge(ZERO, REF_S0_1, TMP14); | |
3393 | |
/* back to offset `off`; the off_plus_1 branch changed the alignment */
3394 vis_alignaddr_g0((void *)off); | |
3395 | |
3396 vis_ld64_2(ref, stride_8, TMP2); | |
3397 vis_mul8x16au(REF_S2, CONST_256, TMP16); | |
3398 vis_pmerge(ZERO, REF_S2_1, TMP18); | |
3399 | |
3400 vis_ld64_2(ref, stride_16, TMP4); | |
3401 ref += stride; | |
3402 vis_mul8x16au(REF_S4, CONST_256, TMP20); | |
3403 vis_pmerge(ZERO, REF_S4_1, TMP22); | |
3404 | |
3405 vis_ld64_2(ref, stride, TMP6); | |
3406 vis_mul8x16au(REF_S6, CONST_256, TMP24); | |
3407 vis_pmerge(ZERO, REF_S6_1, TMP26); | |
3408 | |
3409 vis_ld64_2(ref, stride_8, TMP8); | |
3410 vis_faligndata(TMP0, TMP2, REF_0); | |
3411 | |
3412 vis_ld64_2(ref, stride_16, TMP10); | |
3413 ref += stride; | |
3414 vis_faligndata(TMP2, TMP4, REF_4); | |
3415 | |
3416 vis_faligndata(TMP6, TMP8, REF_S0); | |
3417 | |
3418 vis_faligndata(TMP8, TMP10, REF_S4); | |
3419 | |
3420 if (off != 0x7) { | |
3421 vis_alignaddr_g0((void *)off_plus_1); | |
3422 vis_faligndata(TMP0, TMP2, REF_2); | |
3423 vis_faligndata(TMP2, TMP4, REF_6); | |
3424 vis_faligndata(TMP6, TMP8, REF_S2); | |
3425 vis_faligndata(TMP8, TMP10, REF_S6); | |
3426 } else { | |
3427 vis_src1(TMP2, REF_2); | |
3428 vis_src1(TMP4, REF_6); | |
3429 vis_src1(TMP8, REF_S2); | |
3430 vis_src1(TMP10, REF_S6); | |
3431 } | |
3432 | |
3433 vis_mul8x16au(REF_0, CONST_256, TMP0); | |
3434 vis_pmerge(ZERO, REF_0_1, TMP2); | |
3435 | |
3436 vis_mul8x16au(REF_2, CONST_256, TMP4); | |
3437 vis_pmerge(ZERO, REF_2_1, TMP6); | |
3438 | |
/* TMP8/TMP10 = middle row (off and off + 1) plus rounding; they are reused
 * for both output rows, so both halves take the same CONST_1 that was
 * loaded from constants1 above */
3439 vis_padd16(TMP0, CONST_1, TMP8); | |
3440 vis_mul8x16au(REF_4, CONST_256, TMP0); | |
3441 | |
3442 vis_padd16(TMP2, CONST_1, TMP10); | |
3443 vis_mul8x16au(REF_4_1, CONST_256, TMP2); | |
3444 | |
3445 vis_padd16(TMP8, TMP4, TMP8); | |
3446 vis_mul8x16au(REF_6, CONST_256, TMP4); | |
3447 | |
3448 vis_padd16(TMP10, TMP6, TMP10); | |
3449 vis_mul8x16au(REF_6_1, CONST_256, TMP6); | |
3450 | |
/* first output row, bytes 0..7 */
3451 vis_padd16(TMP12, TMP8, TMP12); | |
3452 | |
3453 vis_padd16(TMP14, TMP10, TMP14); | |
3454 | |
3455 vis_padd16(TMP12, TMP16, TMP12); | |
3456 | |
3457 vis_padd16(TMP14, TMP18, TMP14); | |
3458 vis_pack16(TMP12, DST_0); | |
3459 | |
3460 vis_pack16(TMP14, DST_1); | |
3461 vis_st64(DST_0, dest[0]); | |
/* TMP12/TMP14 = middle row bytes 8..15 plus rounding, likewise shared by
 * both output rows */
3462 vis_padd16(TMP0, CONST_1, TMP12); | |
3463 | |
3464 vis_mul8x16au(REF_S0, CONST_256, TMP0); | |
3465 vis_padd16(TMP2, CONST_1, TMP14); | |
3466 | |
3467 vis_mul8x16au(REF_S0_1, CONST_256, TMP2); | |
3468 vis_padd16(TMP12, TMP4, TMP12); | |
3469 | |
3470 vis_mul8x16au(REF_S2, CONST_256, TMP4); | |
3471 vis_padd16(TMP14, TMP6, TMP14); | |
3472 | |
3473 vis_mul8x16au(REF_S2_1, CONST_256, TMP6); | |
/* first output row, bytes 8..15 */
3474 vis_padd16(TMP20, TMP12, TMP20); | |
3475 | |
3476 vis_padd16(TMP22, TMP14, TMP22); | |
3477 | |
3478 vis_padd16(TMP20, TMP24, TMP20); | |
3479 | |
3480 vis_padd16(TMP22, TMP26, TMP22); | |
3481 vis_pack16(TMP20, DST_2); | |
3482 | |
3483 vis_pack16(TMP22, DST_3); | |
3484 vis_st64_2(DST_2, dest, 8); | |
3485 dest += stride; | |
/* second output row, bytes 0..7: new REF_S0/REF_S2 plus the shared
 * TMP8/TMP10 */
3486 vis_padd16(TMP0, TMP4, TMP24); | |
3487 | |
3488 vis_mul8x16au(REF_S4, CONST_256, TMP0); | |
3489 vis_padd16(TMP2, TMP6, TMP26); | |
3490 | |
3491 vis_mul8x16au(REF_S4_1, CONST_256, TMP2); | |
3492 vis_padd16(TMP24, TMP8, TMP24); | |
3493 | |
3494 vis_padd16(TMP26, TMP10, TMP26); | |
3495 vis_pack16(TMP24, DST_0); | |
3496 | |
3497 vis_pack16(TMP26, DST_1); | |
3498 vis_st64(DST_0, dest[0]); | |
/* second output row, bytes 8..15: new REF_S4/REF_S6 plus the shared
 * TMP12/TMP14 */
3499 vis_pmerge(ZERO, REF_S6, TMP4); | |
3500 | |
3501 vis_pmerge(ZERO, REF_S6_1, TMP6); | |
3502 | |
3503 vis_padd16(TMP0, TMP4, TMP0); | |
3504 | |
3505 vis_padd16(TMP2, TMP6, TMP2); | |
3506 | |
3507 vis_padd16(TMP0, TMP12, TMP0); | |
3508 | |
3509 vis_padd16(TMP2, TMP14, TMP2); | |
3510 vis_pack16(TMP0, DST_2); | |
3511 | |
3512 vis_pack16(TMP2, DST_3); | |
3513 vis_st64_2(DST_2, dest, 8); | |
3514 dest += stride; | |
3515 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3516 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3517 |
/*
 * MC_put_no_round_xy_8_vis
 *
 * Produces two 8-byte dest rows per loop iteration (height >> 1
 * iterations). Every output combines four inputs: the words aligned at
 * `off` and at `off + 1` from two vertically adjacent ref rows, plus the
 * rounding term CONST_1.
 *
 * Register roles: REF_S0 (offset off) and REF_S2 (offset off + 1) carry the
 * row left over from the prologue or the previous iteration; REF_S4 and
 * REF_S6 hold the first row loaded in the current iteration; the second
 * loaded row then replaces REF_S0/REF_S2.
 *
 * NOTE(review): with GSR scale factor 5 and CONST_1 this looks like
 * (a + b + c + d + 1) >> 2 per byte -- confirm against the vis_* macro
 * definitions outside this excerpt.
 *
 * height must be >= 2: the body runs before the loop count is tested.
 */
11458 | 3518 static void MC_put_no_round_xy_8_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 3519 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3520 { |
/* low three address bits of ref, i.e. its byte offset inside an 8-byte word */
2979 | 3521 unsigned long off = (unsigned long) ref & 0x7; |
3522 unsigned long off_plus_1 = off + 1; | |
3523 int stride_8 = stride + 8; | |
3524 | |
3525 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
3526 | |
3527 ref = vis_alignaddr(ref); | |
3528 | |
3529 vis_ld64(ref[ 0], TMP0); | |
3530 vis_fzero(ZERO); | |
3531 | |
3532 vis_ld64(ref[ 8], TMP2); | |
3533 | |
3534 vis_ld64(constants1[0], CONST_1); | |
3535 | |
3536 vis_ld64(constants256_512[0], CONST_256); | |
3537 vis_faligndata(TMP0, TMP2, REF_S0); | |
3538 | |
/* off == 7 would make off_plus_1 == 8, a whole word; in that case the
 * following loaded word is taken as-is instead of re-aligning */
3539 if (off != 0x7) { | |
3540 vis_alignaddr_g0((void *)off_plus_1); | |
3541 vis_faligndata(TMP0, TMP2, REF_S2); | |
3542 } else { | |
3543 vis_src1(TMP2, REF_S2); | |
3544 } | |
3545 | |
3546 height >>= 1; | |
3547 do { /* 26 cycles */ | |
/* widen the carried row (REF_S0/REF_S2) into TMP8..TMP14 while the next
 * two ref rows are loaded */
3548 vis_ld64_2(ref, stride, TMP0); | |
3549 vis_mul8x16au(REF_S0, CONST_256, TMP8); | |
3550 vis_pmerge(ZERO, REF_S2, TMP12); | |
3551 | |
/* back to offset `off`; the off_plus_1 branch changed the alignment */
3552 vis_alignaddr_g0((void *)off); | |
3553 | |
3554 vis_ld64_2(ref, stride_8, TMP2); | |
3555 ref += stride; | |
3556 vis_mul8x16au(REF_S0_1, CONST_256, TMP10); | |
3557 vis_pmerge(ZERO, REF_S2_1, TMP14); | |
3558 | |
3559 vis_ld64_2(ref, stride, TMP4); | |
3560 | |
3561 vis_ld64_2(ref, stride_8, TMP6); | |
3562 ref += stride; | |
3563 vis_faligndata(TMP0, TMP2, REF_S4); | |
3564 | |
3565 vis_pmerge(ZERO, REF_S4, TMP18); | |
3566 | |
3567 vis_pmerge(ZERO, REF_S4_1, TMP20); | |
3568 | |
3569 vis_faligndata(TMP4, TMP6, REF_S0); | |
3570 | |
3571 if (off != 0x7) { | |
3572 vis_alignaddr_g0((void *)off_plus_1); | |
3573 vis_faligndata(TMP0, TMP2, REF_S6); | |
3574 vis_faligndata(TMP4, TMP6, REF_S2); | |
3575 } else { | |
3576 vis_src1(TMP2, REF_S6); | |
3577 vis_src1(TMP6, REF_S2); | |
3578 } | |
3579 | |
/* TMP18/TMP20 = middle row (REF_S4 + REF_S6) plus CONST_1; shared by both
 * output rows of this iteration */
3580 vis_padd16(TMP18, CONST_1, TMP18); | |
3581 vis_mul8x16au(REF_S6, CONST_256, TMP22); | |
3582 | |
3583 vis_padd16(TMP20, CONST_1, TMP20); | |
3584 vis_mul8x16au(REF_S6_1, CONST_256, TMP24); | |
3585 | |
3586 vis_mul8x16au(REF_S0, CONST_256, TMP26); | |
3587 vis_pmerge(ZERO, REF_S0_1, TMP28); | |
3588 | |
3589 vis_mul8x16au(REF_S2, CONST_256, TMP30); | |
3590 vis_padd16(TMP18, TMP22, TMP18); | |
3591 | |
3592 vis_mul8x16au(REF_S2_1, CONST_256, TMP32); | |
3593 vis_padd16(TMP20, TMP24, TMP20); | |
3594 | |
/* first output row: carried row + middle row */
3595 vis_padd16(TMP8, TMP18, TMP8); | |
3596 | |
3597 vis_padd16(TMP10, TMP20, TMP10); | |
3598 | |
3599 vis_padd16(TMP8, TMP12, TMP8); | |
3600 | |
3601 vis_padd16(TMP10, TMP14, TMP10); | |
3602 vis_pack16(TMP8, DST_0); | |
3603 | |
3604 vis_pack16(TMP10, DST_1); | |
3605 vis_st64(DST_0, dest[0]); | |
3606 dest += stride; | |
/* second output row: middle row + newest row (new REF_S0/REF_S2) */
3607 vis_padd16(TMP18, TMP26, TMP18); | |
3608 | |
3609 vis_padd16(TMP20, TMP28, TMP20); | |
3610 | |
3611 vis_padd16(TMP18, TMP30, TMP18); | |
3612 | |
3613 vis_padd16(TMP20, TMP32, TMP20); | |
3614 vis_pack16(TMP18, DST_2); | |
3615 | |
3616 vis_pack16(TMP20, DST_3); | |
3617 vis_st64(DST_2, dest[0]); | |
3618 dest += stride; | |
3619 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3620 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3621 |
/*
 * MC_avg_no_round_xy_16_vis: motion-compensation kernel that reads reference
 * samples at horizontal positions x and x+1 (alignment offsets off and
 * off + 1) on consecutive rows, combines them with the data already present
 * in dest, and writes the result back to dest[0..7] and dest[8..15].
 *
 * dest   - destination rows; loaded and stored as two 8-byte halves
 *          (dest[0] and dest + 8), so presumably it must be 8-byte aligned
 * ref    - reference samples; may be unaligned, the low three address bits
 *          are kept in off and the pointer itself is replaced by the value
 *          returned from vis_alignaddr()
 * stride - byte distance between rows, applied to both ref and dest
 * height - number of rows; it is halved before the loop and every iteration
 *          produces two rows, so it is assumed to be even and >= 2
 *
 * NOTE(review): the exact weighting/rounding (GSR scale factor 4, CONST_6,
 * CONST_256, CONST_1024) depends on the VIS macro definitions that are not
 * visible in this part of the file - confirm there before changing constants.
 */
11458 | 3622 static void MC_avg_no_round_xy_16_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 3623 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3624 { |
2979 | 3625 unsigned long off = (unsigned long) ref & 0x7; |
3626 unsigned long off_plus_1 = off + 1; | |
3627 int stride_8 = stride + 8; | |
3628 int stride_16 = stride + 16; | |
3629 | |
3630 vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT); | |
3631 | |
3632 ref = vis_alignaddr(ref); | |
3633 | |
/* Prologue: fetch three doublewords of the first reference row. */
3634 vis_ld64(ref[ 0], TMP0); | |
3635 vis_fzero(ZERO); | |
3636 | |
3637 vis_ld64(ref[ 8], TMP2); | |
3638 | |
3639 vis_ld64(ref[16], TMP4); | |
3640 | |
3641 vis_ld64(constants6[0], CONST_6); | |
3642 vis_faligndata(TMP0, TMP2, REF_S0); | |
3643 | |
3644 vis_ld64(constants256_1024[0], CONST_256); | |
3645 vis_faligndata(TMP2, TMP4, REF_S4); | |
3646 | |
/*
 * Build the x+1 variants of the first row (REF_S2, REF_S6).
 * NOTE(review): for off == 7 the value off + 1 == 8 presumably does not fit
 * the alignment offset, so the following doubleword is used directly via
 * vis_src1() - confirm against the macro definitions.
 */
3647 if (off != 0x7) { | |
3648 vis_alignaddr_g0((void *)off_plus_1); | |
3649 vis_faligndata(TMP0, TMP2, REF_S2); | |
3650 vis_faligndata(TMP2, TMP4, REF_S6); | |
3651 } else { | |
3652 vis_src1(TMP2, REF_S2); | |
3653 vis_src1(TMP4, REF_S6); | |
3654 } | |
3655 | |
/*
 * Two output rows per iteration. The do/while decrements before testing,
 * so a halved count of 0 would not terminate at zero.
 */
3656 height >>= 1; | |
3657 do { /* 55 cycles */ | |
3658 vis_ld64_2(ref, stride, TMP0); | |
3659 vis_mul8x16au(REF_S0, CONST_256, TMP12); | |
3660 vis_pmerge(ZERO, REF_S0_1, TMP14); | |
3661 | |
/* Switch the alignment offset back to off for the rows loaded below. */
3662 vis_alignaddr_g0((void *)off); | |
3663 | |
3664 vis_ld64_2(ref, stride_8, TMP2); | |
3665 vis_mul8x16au(REF_S2, CONST_256, TMP16); | |
3666 vis_pmerge(ZERO, REF_S2_1, TMP18); | |
3667 | |
3668 vis_ld64_2(ref, stride_16, TMP4); | |
3669 ref += stride; | |
3670 vis_mul8x16au(REF_S4, CONST_256, TMP20); | |
3671 vis_pmerge(ZERO, REF_S4_1, TMP22); | |
3672 | |
3673 vis_ld64_2(ref, stride, TMP6); | |
3674 vis_mul8x16au(REF_S6, CONST_256, TMP24); | |
3675 vis_pmerge(ZERO, REF_S6_1, TMP26); | |
3676 | |
3677 vis_ld64_2(ref, stride_8, TMP8); | |
3678 vis_faligndata(TMP0, TMP2, REF_0); | |
3679 | |
3680 vis_ld64_2(ref, stride_16, TMP10); | |
3681 ref += stride; | |
3682 vis_faligndata(TMP2, TMP4, REF_4); | |
3683 | |
/* Current destination contents of the first output row (both halves). */
3684 vis_ld64(dest[0], DST_0); | |
3685 vis_faligndata(TMP6, TMP8, REF_S0); | |
3686 | |
3687 vis_ld64_2(dest, 8, DST_2); | |
3688 vis_faligndata(TMP8, TMP10, REF_S4); | |
3689 | |
/* x+1 variants of the two freshly loaded rows; same off == 7 special case. */
3690 if (off != 0x7) { | |
3691 vis_alignaddr_g0((void *)off_plus_1); | |
3692 vis_faligndata(TMP0, TMP2, REF_2); | |
3693 vis_faligndata(TMP2, TMP4, REF_6); | |
3694 vis_faligndata(TMP6, TMP8, REF_S2); | |
3695 vis_faligndata(TMP8, TMP10, REF_S6); | |
3696 } else { | |
3697 vis_src1(TMP2, REF_2); | |
3698 vis_src1(TMP4, REF_6); | |
3699 vis_src1(TMP8, REF_S2); | |
3700 vis_src1(TMP10, REF_S6); | |
3701 } | |
3702 | |
/*
 * From here on REF_0/REF_2/REF_4/REF_6 are overwritten and reused as
 * scratch accumulators, so the statement order below matters.
 */
3703 vis_mul8x16al(DST_0, CONST_1024, TMP30); | |
3704 vis_pmerge(ZERO, REF_0, TMP0); | |
3705 | |
3706 vis_mul8x16al(DST_1, CONST_1024, TMP32); | |
3707 vis_pmerge(ZERO, REF_0_1, TMP2); | |
3708 | |
3709 vis_mul8x16au(REF_2, CONST_256, TMP4); | |
3710 vis_pmerge(ZERO, REF_2_1, TMP6); | |
3711 | |
3712 vis_mul8x16al(DST_2, CONST_1024, REF_0); | |
3713 vis_padd16(TMP0, CONST_6, TMP0); | |
3714 | |
3715 vis_mul8x16al(DST_3, CONST_1024, REF_2); | |
3716 vis_padd16(TMP2, CONST_6, TMP2); | |
3717 | |
3718 vis_padd16(TMP0, TMP4, TMP0); | |
3719 vis_mul8x16au(REF_4, CONST_256, TMP4); | |
3720 | |
3721 vis_padd16(TMP2, TMP6, TMP2); | |
3722 vis_mul8x16au(REF_4_1, CONST_256, TMP6); | |
3723 | |
3724 vis_padd16(TMP12, TMP0, TMP12); | |
3725 vis_mul8x16au(REF_6, CONST_256, TMP8); | |
3726 | |
3727 vis_padd16(TMP14, TMP2, TMP14); | |
3728 vis_mul8x16au(REF_6_1, CONST_256, TMP10); | |
3729 | |
3730 vis_padd16(TMP12, TMP16, TMP12); | |
3731 vis_mul8x16au(REF_S0, CONST_256, REF_4); | |
3732 | |
3733 vis_padd16(TMP14, TMP18, TMP14); | |
3734 vis_mul8x16au(REF_S0_1, CONST_256, REF_6); | |
3735 | |
3736 vis_padd16(TMP12, TMP30, TMP12); | |
3737 | |
3738 vis_padd16(TMP14, TMP32, TMP14); | |
3739 vis_pack16(TMP12, DST_0); | |
3740 | |
/* First row, bytes 0..7. */
3741 vis_pack16(TMP14, DST_1); | |
3742 vis_st64(DST_0, dest[0]); | |
3743 vis_padd16(TMP4, CONST_6, TMP4); | |
3744 | |
3745 vis_ld64_2(dest, stride, DST_0); | |
3746 vis_padd16(TMP6, CONST_6, TMP6); | |
3747 vis_mul8x16au(REF_S2, CONST_256, TMP12); | |
3748 | |
3749 vis_padd16(TMP4, TMP8, TMP4); | |
3750 vis_mul8x16au(REF_S2_1, CONST_256, TMP14); | |
3751 | |
3752 vis_padd16(TMP6, TMP10, TMP6); | |
3753 | |
3754 vis_padd16(TMP20, TMP4, TMP20); | |
3755 | |
3756 vis_padd16(TMP22, TMP6, TMP22); | |
3757 | |
3758 vis_padd16(TMP20, TMP24, TMP20); | |
3759 | |
3760 vis_padd16(TMP22, TMP26, TMP22); | |
3761 | |
3762 vis_padd16(TMP20, REF_0, TMP20); | |
3763 vis_mul8x16au(REF_S4, CONST_256, REF_0); | |
3764 | |
3765 vis_padd16(TMP22, REF_2, TMP22); | |
3766 vis_pack16(TMP20, DST_2); | |
3767 | |
/* First row, bytes 8..15, then advance to the second output row. */
3768 vis_pack16(TMP22, DST_3); | |
3769 vis_st64_2(DST_2, dest, 8); | |
3770 dest += stride; | |
3771 | |
3772 vis_ld64_2(dest, 8, DST_2); | |
3773 vis_mul8x16al(DST_0, CONST_1024, TMP30); | |
3774 vis_pmerge(ZERO, REF_S4_1, REF_2); | |
3775 | |
3776 vis_mul8x16al(DST_1, CONST_1024, TMP32); | |
3777 vis_padd16(REF_4, TMP0, TMP8); | |
3778 | |
3779 vis_mul8x16au(REF_S6, CONST_256, REF_4); | |
3780 vis_padd16(REF_6, TMP2, TMP10); | |
3781 | |
3782 vis_mul8x16au(REF_S6_1, CONST_256, REF_6); | |
3783 vis_padd16(TMP8, TMP12, TMP8); | |
3784 | |
3785 vis_padd16(TMP10, TMP14, TMP10); | |
3786 | |
3787 vis_padd16(TMP8, TMP30, TMP8); | |
3788 | |
3789 vis_padd16(TMP10, TMP32, TMP10); | |
3790 vis_pack16(TMP8, DST_0); | |
3791 | |
/* Second row, bytes 0..7. */
3792 vis_pack16(TMP10, DST_1); | |
3793 vis_st64(DST_0, dest[0]); | |
3794 | |
3795 vis_padd16(REF_0, TMP4, REF_0); | |
3796 | |
3797 vis_mul8x16al(DST_2, CONST_1024, TMP30); | |
3798 vis_padd16(REF_2, TMP6, REF_2); | |
3799 | |
3800 vis_mul8x16al(DST_3, CONST_1024, TMP32); | |
3801 vis_padd16(REF_0, REF_4, REF_0); | |
3802 | |
3803 vis_padd16(REF_2, REF_6, REF_2); | |
3804 | |
3805 vis_padd16(REF_0, TMP30, REF_0); | |
3806 | |
3807 /* stall */ | |
3808 | |
3809 vis_padd16(REF_2, TMP32, REF_2); | |
3810 vis_pack16(REF_0, DST_2); | |
3811 | |
/* Second row, bytes 8..15. */
3812 vis_pack16(REF_2, DST_3); | |
3813 vis_st64_2(DST_2, dest, 8); | |
3814 dest += stride; | |
3815 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3816 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3817 |
/*
 * MC_avg_no_round_xy_8_vis: 8-byte-wide counterpart of the 16-wide xy kernel.
 * Reads reference samples at alignment offsets off and off + 1 on consecutive
 * rows, combines them with the data already present in dest, and stores one
 * 8-byte group per row.
 *
 * dest   - destination rows; dest[0] and dest + stride are loaded up front
 *          and each row is stored back as a single 8-byte group
 * ref    - reference samples; may be unaligned, the low three address bits
 *          are kept in off and the pointer itself is replaced by the value
 *          returned from vis_alignaddr()
 * stride - byte distance between rows, applied to both ref and dest
 * height - number of rows; it is halved before the loop and every iteration
 *          produces two rows, so it is assumed to be even and >= 2
 *
 * NOTE(review): the exact weighting/rounding (GSR scale factor 4, CONST_6,
 * CONST_256, CONST_1024) depends on the VIS macro definitions that are not
 * visible in this part of the file - confirm there before changing constants.
 */
11458 | 3818 static void MC_avg_no_round_xy_8_vis (uint8_t * dest, const uint8_t * ref, |
2979 | 3819 const int stride, int height) |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3820 { |
2979 | 3821 unsigned long off = (unsigned long) ref & 0x7; |
3822 unsigned long off_plus_1 = off + 1; | |
3823 int stride_8 = stride + 8; | |
3824 | |
3825 vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT); | |
3826 | |
3827 ref = vis_alignaddr(ref); | |
3828 | |
/* Prologue: fetch two doublewords of the first reference row. */
3829 vis_ld64(ref[0], TMP0); | |
3830 vis_fzero(ZERO); | |
3831 | |
3832 vis_ld64_2(ref, 8, TMP2); | |
3833 | |
3834 vis_ld64(constants6[0], CONST_6); | |
3835 | |
3836 vis_ld64(constants256_1024[0], CONST_256); | |
3837 vis_faligndata(TMP0, TMP2, REF_S0); | |
3838 | |
/*
 * Build the x+1 variant of the first row (REF_S2).
 * NOTE(review): for off == 7 the value off + 1 == 8 presumably does not fit
 * the alignment offset, so the following doubleword is used directly via
 * vis_src1() - confirm against the macro definitions.
 */
3839 if (off != 0x7) { | |
3840 vis_alignaddr_g0((void *)off_plus_1); | |
3841 vis_faligndata(TMP0, TMP2, REF_S2); | |
3842 } else { | |
3843 vis_src1(TMP2, REF_S2); | |
3844 } | |
3845 | |
/*
 * Two output rows per iteration. The do/while decrements before testing,
 * so a halved count of 0 would not terminate at zero.
 */
3846 height >>= 1; | |
3847 do { /* 31 cycles */ | |
3848 vis_ld64_2(ref, stride, TMP0); | |
3849 vis_mul8x16au(REF_S0, CONST_256, TMP8); | |
3850 vis_pmerge(ZERO, REF_S0_1, TMP10); | |
3851 | |
3852 vis_ld64_2(ref, stride_8, TMP2); | |
3853 ref += stride; | |
3854 vis_mul8x16au(REF_S2, CONST_256, TMP12); | |
3855 vis_pmerge(ZERO, REF_S2_1, TMP14); | |
3856 | |
/* Switch the alignment offset back to off for the rows loaded in this pass. */
3857 vis_alignaddr_g0((void *)off); | |
3858 | |
3859 vis_ld64_2(ref, stride, TMP4); | |
3860 vis_faligndata(TMP0, TMP2, REF_S4); | |
3861 | |
3862 vis_ld64_2(ref, stride_8, TMP6); | |
3863 ref += stride; | |
3864 | |
/* Current destination contents of both output rows. */
3865 vis_ld64(dest[0], DST_0); | |
3866 vis_faligndata(TMP4, TMP6, REF_S0); | |
3867 | |
3868 vis_ld64_2(dest, stride, DST_2); | |
3869 | |
/* x+1 variants of the two freshly loaded rows; same off == 7 special case. */
3870 if (off != 0x7) { | |
3871 vis_alignaddr_g0((void *)off_plus_1); | |
3872 vis_faligndata(TMP0, TMP2, REF_S6); | |
3873 vis_faligndata(TMP4, TMP6, REF_S2); | |
3874 } else { | |
3875 vis_src1(TMP2, REF_S6); | |
3876 vis_src1(TMP6, REF_S2); | |
3877 } | |
3878 | |
/*
 * REF_S4/REF_S6 and REF_0/REF_2 are overwritten below and reused as
 * scratch accumulators, so the statement order matters.
 */
3879 vis_mul8x16al(DST_0, CONST_1024, TMP30); | |
3880 vis_pmerge(ZERO, REF_S4, TMP22); | |
3881 | |
3882 vis_mul8x16al(DST_1, CONST_1024, TMP32); | |
3883 vis_pmerge(ZERO, REF_S4_1, TMP24); | |
3884 | |
3885 vis_mul8x16au(REF_S6, CONST_256, TMP26); | |
3886 vis_pmerge(ZERO, REF_S6_1, TMP28); | |
3887 | |
3888 vis_mul8x16au(REF_S0, CONST_256, REF_S4); | |
3889 vis_padd16(TMP22, CONST_6, TMP22); | |
3890 | |
3891 vis_mul8x16au(REF_S0_1, CONST_256, REF_S6); | |
3892 vis_padd16(TMP24, CONST_6, TMP24); | |
3893 | |
3894 vis_mul8x16al(DST_2, CONST_1024, REF_0); | |
3895 vis_padd16(TMP22, TMP26, TMP22); | |
3896 | |
3897 vis_mul8x16al(DST_3, CONST_1024, REF_2); | |
3898 vis_padd16(TMP24, TMP28, TMP24); | |
3899 | |
3900 vis_mul8x16au(REF_S2, CONST_256, TMP26); | |
3901 vis_padd16(TMP8, TMP22, TMP8); | |
3902 | |
3903 vis_mul8x16au(REF_S2_1, CONST_256, TMP28); | |
3904 vis_padd16(TMP10, TMP24, TMP10); | |
3905 | |
3906 vis_padd16(TMP8, TMP12, TMP8); | |
3907 | |
3908 vis_padd16(TMP10, TMP14, TMP10); | |
3909 | |
3910 vis_padd16(TMP8, TMP30, TMP8); | |
3911 | |
3912 vis_padd16(TMP10, TMP32, TMP10); | |
3913 vis_pack16(TMP8, DST_0); | |
3914 | |
/* First output row, then advance to the second. */
3915 vis_pack16(TMP10, DST_1); | |
3916 vis_st64(DST_0, dest[0]); | |
3917 dest += stride; | |
3918 | |
3919 vis_padd16(REF_S4, TMP22, TMP12); | |
3920 | |
3921 vis_padd16(REF_S6, TMP24, TMP14); | |
3922 | |
3923 vis_padd16(TMP12, TMP26, TMP12); | |
3924 | |
3925 vis_padd16(TMP14, TMP28, TMP14); | |
3926 | |
3927 vis_padd16(TMP12, REF_0, TMP12); | |
3928 | |
3929 vis_padd16(TMP14, REF_2, TMP14); | |
3930 vis_pack16(TMP12, DST_2); | |
3931 | |
/* Second output row. */
3932 vis_pack16(TMP14, DST_3); | |
3933 vis_st64(DST_2, dest[0]); | |
3934 dest += stride; | |
3935 } while (--height); | |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3936 } |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3937 |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3938 /* End of no rounding code */ |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3939 |
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
/* Bit flags naming the SPARC VIS acceleration levels. */
#define ACCEL_SPARC_VIS 1
#define ACCEL_SPARC_VIS2 2

/**
 * Report which VIS acceleration levels should be enabled.
 *
 * No runtime probing is done here: both flags are always reported.
 *
 * @return bitwise OR of ACCEL_SPARC_VIS and ACCEL_SPARC_VIS2
 */
static int vis_level(void)
{
    return ACCEL_SPARC_VIS | ACCEL_SPARC_VIS2;
}
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3950 |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3951 /* libavcodec initialization code */ |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3952 void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx) |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3953 { |
5967
15ed47af1838
Misc spelling fixes, prefer American over British English.
diego
parents:
5963
diff
changeset
|
3954 /* VIS-specific optimizations */ |
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3955 int accel = vis_level (); |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3956 |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3957 if (accel & ACCEL_SPARC_VIS) { |
5618 | 3958 if(avctx->idct_algo==FF_IDCT_SIMPLEVIS){ |
3959 c->idct_put = ff_simple_idct_put_vis; | |
3960 c->idct_add = ff_simple_idct_add_vis; | |
3961 c->idct = ff_simple_idct_vis; | |
3962 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; | |
3963 } | |
3964 | |
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3965 c->put_pixels_tab[0][0] = MC_put_o_16_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3966 c->put_pixels_tab[0][1] = MC_put_x_16_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3967 c->put_pixels_tab[0][2] = MC_put_y_16_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3968 c->put_pixels_tab[0][3] = MC_put_xy_16_vis; |
2967 | 3969 |
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3970 c->put_pixels_tab[1][0] = MC_put_o_8_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3971 c->put_pixels_tab[1][1] = MC_put_x_8_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3972 c->put_pixels_tab[1][2] = MC_put_y_8_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3973 c->put_pixels_tab[1][3] = MC_put_xy_8_vis; |
2967 | 3974 |
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3975 c->avg_pixels_tab[0][0] = MC_avg_o_16_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3976 c->avg_pixels_tab[0][1] = MC_avg_x_16_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3977 c->avg_pixels_tab[0][2] = MC_avg_y_16_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3978 c->avg_pixels_tab[0][3] = MC_avg_xy_16_vis; |
2967 | 3979 |
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3980 c->avg_pixels_tab[1][0] = MC_avg_o_8_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3981 c->avg_pixels_tab[1][1] = MC_avg_x_8_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3982 c->avg_pixels_tab[1][2] = MC_avg_y_8_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3983 c->avg_pixels_tab[1][3] = MC_avg_xy_8_vis; |
2967 | 3984 |
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3985 c->put_no_rnd_pixels_tab[0][0] = MC_put_no_round_o_16_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3986 c->put_no_rnd_pixels_tab[0][1] = MC_put_no_round_x_16_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3987 c->put_no_rnd_pixels_tab[0][2] = MC_put_no_round_y_16_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3988 c->put_no_rnd_pixels_tab[0][3] = MC_put_no_round_xy_16_vis; |
2967 | 3989 |
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3990 c->put_no_rnd_pixels_tab[1][0] = MC_put_no_round_o_8_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3991 c->put_no_rnd_pixels_tab[1][1] = MC_put_no_round_x_8_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3992 c->put_no_rnd_pixels_tab[1][2] = MC_put_no_round_y_8_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3993 c->put_no_rnd_pixels_tab[1][3] = MC_put_no_round_xy_8_vis; |
2967 | 3994 |
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3995 c->avg_no_rnd_pixels_tab[0][0] = MC_avg_no_round_o_16_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3996 c->avg_no_rnd_pixels_tab[0][1] = MC_avg_no_round_x_16_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3997 c->avg_no_rnd_pixels_tab[0][2] = MC_avg_no_round_y_16_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3998 c->avg_no_rnd_pixels_tab[0][3] = MC_avg_no_round_xy_16_vis; |
2967 | 3999 |
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4000 c->avg_no_rnd_pixels_tab[1][0] = MC_avg_no_round_o_8_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4001 c->avg_no_rnd_pixels_tab[1][1] = MC_avg_no_round_x_8_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4002 c->avg_no_rnd_pixels_tab[1][2] = MC_avg_no_round_y_8_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4003 c->avg_no_rnd_pixels_tab[1][3] = MC_avg_no_round_xy_8_vis; |
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4004 } |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
4005 } |