comparison alpha/dsputil_alpha.c @ 214:73df666cacc7 libavcodec

Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
author nickols_k
date Sun, 20 Jan 2002 14:48:02 +0000
parents
children 718a22dc121f
comparison
equal deleted inserted replaced
213:e80ad397d30e 214:73df666cacc7
1 /*
2 * Alpha optimized DSP utils
3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20 #include "asm.h"
21 #include "../dsputil.h"
22
23 void simple_idct_axp(DCTELEM *block);
24
25 static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
26 int line_size)
27 {
28 int i = 8;
29 do {
30 UINT64 shorts;
31
32 shorts = ldq(block);
33 shorts = maxsw4(shorts, 0);
34 shorts = minsw4(shorts, WORD_VEC(0x00ff));
35 stl(pkwb(shorts), pixels);
36
37 shorts = ldq(block + 4);
38 shorts = maxsw4(shorts, 0);
39 shorts = minsw4(shorts, WORD_VEC(0x00ff));
40 stl(pkwb(shorts), pixels + 4);
41
42 pixels += line_size;
43 block += 8;
44 } while (--i);
45 }
46
47 static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
48 int line_size)
49 {
50 int i = 8;
51 do {
52 UINT64 shorts;
53
54 shorts = ldq(block);
55 shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
56 shorts += unpkbw(ldl(pixels));
57 shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */
58 shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
59 shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
60 shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
61 stl(pkwb(shorts), pixels);
62
63 /* next 4 */
64 shorts = ldq(block + 4);
65 shorts &= ~WORD_VEC(0x8000);
66 shorts += unpkbw(ldl(pixels + 4));
67 shorts &= ~WORD_VEC(0x8000);
68 shorts = minuw4(shorts, WORD_VEC(0x4000));
69 shorts &= ~WORD_VEC(0x4000);
70 shorts = minsw4(shorts, WORD_VEC(0x00ff));
71 stl(pkwb(shorts), pixels + 4);
72
73 pixels += line_size;
74 block += 8;
75 } while (--i);
76 }
77
78 /* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
79 Since the immediate result could be greater than 255, we do the
80 shift first. The result is too low by one if the bytes were both
81 odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */
82 static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2)
83 {
84 UINT64 correction = (l1 & l2) & BYTE_VEC(0x01);
85 l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
86 l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
87 return l1 + l2 + correction;
88 }
89
90 /* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1
91 The '1' only has an effect when one byte is even and the other odd,
92 i. e. we also need to add (l1 ^ l2) & BYTE_VEC(0x01).
93 Incidentally, that is equivalent to (l1 | l2) & BYTE_VEC(0x01). */
94 static inline UINT64 avg2(UINT64 l1, UINT64 l2)
95 {
96 UINT64 correction = (l1 | l2) & BYTE_VEC(0x01);
97 l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
98 l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
99 return l1 + l2 + correction;
100 }
101
102 static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
103 {
104 UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
105 + ((l2 & ~BYTE_VEC(0x03)) >> 2)
106 + ((l3 & ~BYTE_VEC(0x03)) >> 2)
107 + ((l4 & ~BYTE_VEC(0x03)) >> 2);
108 UINT64 r2 = (( (l1 & BYTE_VEC(0x03))
109 + (l2 & BYTE_VEC(0x03))
110 + (l3 & BYTE_VEC(0x03))
111 + (l4 & BYTE_VEC(0x03))
112 + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
113 return r1 + r2;
114 }
115
116 static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
117 {
118 UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
119 + ((l2 & ~BYTE_VEC(0x03)) >> 2)
120 + ((l3 & ~BYTE_VEC(0x03)) >> 2)
121 + ((l4 & ~BYTE_VEC(0x03)) >> 2);
122 UINT64 r2 = (( (l1 & BYTE_VEC(0x03))
123 + (l2 & BYTE_VEC(0x03))
124 + (l3 & BYTE_VEC(0x03))
125 + (l4 & BYTE_VEC(0x03))
126 + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03);
127 return r1 + r2;
128 }
129
130 #define PIXOPNAME(suffix) put ## suffix
131 #define BTYPE UINT8
132 #define AVG2 avg2
133 #define AVG4 avg4
134 #define STORE(l, b) stq(l, b)
135 #include "pixops.h"
136 #undef PIXOPNAME
137 #undef BTYPE
138 #undef AVG2
139 #undef AVG4
140 #undef STORE
141
142 #define PIXOPNAME(suffix) put_no_rnd ## suffix
143 #define BTYPE UINT8
144 #define AVG2 avg2_no_rnd
145 #define AVG4 avg4_no_rnd
146 #define STORE(l, b) stq(l, b)
147 #include "pixops.h"
148 #undef PIXOPNAME
149 #undef BTYPE
150 #undef AVG2
151 #undef AVG4
152 #undef STORE
153
/* The following functions are untested. */
#if 0

/* avg_*: the computed result is averaged (with rounding) with the eight
   bytes already present at the destination before it is stored.
   STORE deliberately has no trailing semicolon, matching the STORE
   definitions of the enabled put variants: the statement terminator is
   supplied where the macro is used, and a second one baked into the
   macro would expand to a stray empty statement. */
#define PIXOPNAME(suffix) avg ## suffix
#define BTYPE UINT8
#define AVG2 avg2
#define AVG4 avg4
#define STORE(l, b) stq(AVG2(l, ldq(b)), b)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE

/* avg_no_rnd_*: as above, but using the truncating averaging helpers,
   both for the computation and for merging with the destination. */
#define PIXOPNAME(suffix) avg_no_rnd ## suffix
#define BTYPE UINT8
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define STORE(l, b) stq(AVG2(l, ldq(b)), b)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE

/* sub_*: the destination is an array of DCTELEMs; each of the eight
   result bytes in l (lowest byte first) is subtracted from the
   corresponding element block[0..7].  l is copied into a local first so
   that it is evaluated only once. */
#define PIXOPNAME(suffix) sub ## suffix
#define BTYPE DCTELEM
#define AVG2 avg2
#define AVG4 avg4
#define STORE(l, block) do {                \
        UINT64 xxx = l;                     \
        (block)[0] -= (xxx >>  0) & 0xff;   \
        (block)[1] -= (xxx >>  8) & 0xff;   \
        (block)[2] -= (xxx >> 16) & 0xff;   \
        (block)[3] -= (xxx >> 24) & 0xff;   \
        (block)[4] -= (xxx >> 32) & 0xff;   \
        (block)[5] -= (xxx >> 40) & 0xff;   \
        (block)[6] -= (xxx >> 48) & 0xff;   \
        (block)[7] -= (xxx >> 56) & 0xff;   \
    } while (0)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE

#endif
204
205 void dsputil_init_alpha(void)
206 {
207 put_pixels_tab[0] = put_pixels_axp;
208 put_pixels_tab[1] = put_pixels_x2_axp;
209 put_pixels_tab[2] = put_pixels_y2_axp;
210 put_pixels_tab[3] = put_pixels_xy2_axp;
211
212 put_no_rnd_pixels_tab[0] = put_pixels_axp;
213 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
214 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
215 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
216
217 /* amask clears all bits that correspond to present features. */
218 if (amask(AMASK_MVI) == 0) {
219 fprintf(stderr, "MVI extension detected\n");
220 put_pixels_clamped = put_pixels_clamped_axp;
221 add_pixels_clamped = add_pixels_clamped_axp;
222 }
223 }