Mercurial > libavcodec.hg
comparison alpha/dsputil_alpha.c @ 214:73df666cacc7 libavcodec
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
author | nickols_k |
---|---|
date | Sun, 20 Jan 2002 14:48:02 +0000 |
parents | |
children | 718a22dc121f |
comparison
equal
deleted
inserted
replaced
213:e80ad397d30e | 214:73df666cacc7 |
---|---|
1 /* | |
2 * Alpha optimized DSP utils | |
3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org> | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 */ | |
19 | |
20 #include "asm.h" | |
21 #include "../dsputil.h" | |
22 | |
23 void simple_idct_axp(DCTELEM *block); | |
24 | |
25 static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, | |
26 int line_size) | |
27 { | |
28 int i = 8; | |
29 do { | |
30 UINT64 shorts; | |
31 | |
32 shorts = ldq(block); | |
33 shorts = maxsw4(shorts, 0); | |
34 shorts = minsw4(shorts, WORD_VEC(0x00ff)); | |
35 stl(pkwb(shorts), pixels); | |
36 | |
37 shorts = ldq(block + 4); | |
38 shorts = maxsw4(shorts, 0); | |
39 shorts = minsw4(shorts, WORD_VEC(0x00ff)); | |
40 stl(pkwb(shorts), pixels + 4); | |
41 | |
42 pixels += line_size; | |
43 block += 8; | |
44 } while (--i); | |
45 } | |
46 | |
47 static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, | |
48 int line_size) | |
49 { | |
50 int i = 8; | |
51 do { | |
52 UINT64 shorts; | |
53 | |
54 shorts = ldq(block); | |
55 shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */ | |
56 shorts += unpkbw(ldl(pixels)); | |
57 shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */ | |
58 shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */ | |
59 shorts &= ~WORD_VEC(0x4000); /* ...and zap them */ | |
60 shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */ | |
61 stl(pkwb(shorts), pixels); | |
62 | |
63 /* next 4 */ | |
64 shorts = ldq(block + 4); | |
65 shorts &= ~WORD_VEC(0x8000); | |
66 shorts += unpkbw(ldl(pixels + 4)); | |
67 shorts &= ~WORD_VEC(0x8000); | |
68 shorts = minuw4(shorts, WORD_VEC(0x4000)); | |
69 shorts &= ~WORD_VEC(0x4000); | |
70 shorts = minsw4(shorts, WORD_VEC(0x00ff)); | |
71 stl(pkwb(shorts), pixels + 4); | |
72 | |
73 pixels += line_size; | |
74 block += 8; | |
75 } while (--i); | |
76 } | |
77 | |
78 /* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1 | |
79 Since the immediate result could be greater than 255, we do the | |
80 shift first. The result is too low by one if the bytes were both | |
81 odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */ | |
82 static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2) | |
83 { | |
84 UINT64 correction = (l1 & l2) & BYTE_VEC(0x01); | |
85 l1 = (l1 & ~BYTE_VEC(0x01)) >> 1; | |
86 l2 = (l2 & ~BYTE_VEC(0x01)) >> 1; | |
87 return l1 + l2 + correction; | |
88 } | |
89 | |
90 /* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1 | |
91 The '1' only has an effect when one byte is even and the other odd, | |
92 i. e. we also need to add (l1 ^ l2) & BYTE_VEC(0x01). | |
93 Incidentally, that is equivalent to (l1 | l2) & BYTE_VEC(0x01). */ | |
94 static inline UINT64 avg2(UINT64 l1, UINT64 l2) | |
95 { | |
96 UINT64 correction = (l1 | l2) & BYTE_VEC(0x01); | |
97 l1 = (l1 & ~BYTE_VEC(0x01)) >> 1; | |
98 l2 = (l2 & ~BYTE_VEC(0x01)) >> 1; | |
99 return l1 + l2 + correction; | |
100 } | |
101 | |
102 static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4) | |
103 { | |
104 UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) | |
105 + ((l2 & ~BYTE_VEC(0x03)) >> 2) | |
106 + ((l3 & ~BYTE_VEC(0x03)) >> 2) | |
107 + ((l4 & ~BYTE_VEC(0x03)) >> 2); | |
108 UINT64 r2 = (( (l1 & BYTE_VEC(0x03)) | |
109 + (l2 & BYTE_VEC(0x03)) | |
110 + (l3 & BYTE_VEC(0x03)) | |
111 + (l4 & BYTE_VEC(0x03)) | |
112 + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); | |
113 return r1 + r2; | |
114 } | |
115 | |
116 static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4) | |
117 { | |
118 UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) | |
119 + ((l2 & ~BYTE_VEC(0x03)) >> 2) | |
120 + ((l3 & ~BYTE_VEC(0x03)) >> 2) | |
121 + ((l4 & ~BYTE_VEC(0x03)) >> 2); | |
122 UINT64 r2 = (( (l1 & BYTE_VEC(0x03)) | |
123 + (l2 & BYTE_VEC(0x03)) | |
124 + (l3 & BYTE_VEC(0x03)) | |
125 + (l4 & BYTE_VEC(0x03)) | |
126 + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03); | |
127 return r1 + r2; | |
128 } | |
129 | |
130 #define PIXOPNAME(suffix) put ## suffix | |
131 #define BTYPE UINT8 | |
132 #define AVG2 avg2 | |
133 #define AVG4 avg4 | |
134 #define STORE(l, b) stq(l, b) | |
135 #include "pixops.h" | |
136 #undef PIXOPNAME | |
137 #undef BTYPE | |
138 #undef AVG2 | |
139 #undef AVG4 | |
140 #undef STORE | |
141 | |
142 #define PIXOPNAME(suffix) put_no_rnd ## suffix | |
143 #define BTYPE UINT8 | |
144 #define AVG2 avg2_no_rnd | |
145 #define AVG4 avg4_no_rnd | |
146 #define STORE(l, b) stq(l, b) | |
147 #include "pixops.h" | |
148 #undef PIXOPNAME | |
149 #undef BTYPE | |
150 #undef AVG2 | |
151 #undef AVG4 | |
152 #undef STORE | |
153 | |
/* The following functions are untested and therefore compiled out.
   Each section sets the pixops.h parameter macros, includes the header,
   and removes the macros again.

   STORE deliberately expands WITHOUT a trailing semicolon, exactly like
   the "put" variants: the use site supplies it.  A semicolon inside the
   macro would produce an empty extra statement and break an unbraced
   "if (c) STORE(x, p); else ...".  */
#if 0

/* avg<suffix>: average the new value with what is already stored
   (rounding primitives). */
#define PIXOPNAME(suffix) avg ## suffix
#define BTYPE UINT8
#define AVG2 avg2
#define AVG4 avg4
#define STORE(l, b) stq(AVG2(l, ldq(b)), b)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE

/* avg_no_rnd<suffix>: same, using the non-rounding primitives. */
#define PIXOPNAME(suffix) avg_no_rnd ## suffix
#define BTYPE UINT8
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define STORE(l, b) stq(AVG2(l, ldq(b)), b)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE

/* sub<suffix>: subtract each of the 8 bytes of the computed value from
   the corresponding DCTELEM of the destination (byte 0 is the least
   significant byte). */
#define PIXOPNAME(suffix) sub ## suffix
#define BTYPE DCTELEM
#define AVG2 avg2
#define AVG4 avg4
#define STORE(l, block) do {                        \
    UINT64 store_bytes_ = (l);                      \
    (block)[0] -= (store_bytes_ >>  0) & 0xff;      \
    (block)[1] -= (store_bytes_ >>  8) & 0xff;      \
    (block)[2] -= (store_bytes_ >> 16) & 0xff;      \
    (block)[3] -= (store_bytes_ >> 24) & 0xff;      \
    (block)[4] -= (store_bytes_ >> 32) & 0xff;      \
    (block)[5] -= (store_bytes_ >> 40) & 0xff;      \
    (block)[6] -= (store_bytes_ >> 48) & 0xff;      \
    (block)[7] -= (store_bytes_ >> 56) & 0xff;      \
} while (0)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE

#endif
204 | |
205 void dsputil_init_alpha(void) | |
206 { | |
207 put_pixels_tab[0] = put_pixels_axp; | |
208 put_pixels_tab[1] = put_pixels_x2_axp; | |
209 put_pixels_tab[2] = put_pixels_y2_axp; | |
210 put_pixels_tab[3] = put_pixels_xy2_axp; | |
211 | |
212 put_no_rnd_pixels_tab[0] = put_pixels_axp; | |
213 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp; | |
214 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp; | |
215 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp; | |
216 | |
217 /* amask clears all bits that correspond to present features. */ | |
218 if (amask(AMASK_MVI) == 0) { | |
219 fprintf(stderr, "MVI extension detected\n"); | |
220 put_pixels_clamped = put_pixels_clamped_axp; | |
221 add_pixels_clamped = add_pixels_clamped_axp; | |
222 } | |
223 } |