Mercurial > libavcodec.hg
comparison i386/dsputil_mmx_avg.h @ 0:986e461dc072 libavcodec
Initial revision
author | glantau |
---|---|
date | Sun, 22 Jul 2001 14:18:56 +0000 |
parents | |
children | 4479bcab253e |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:986e461dc072 |
---|---|
1 /* | |
2 * DSP utils : average functions are compiled twice for 3dnow/mmx2 | |
3 * Copyright (c) 2000, 2001 Gerard Lantau. | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 * | |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
23 { | |
24 int dh, hh; | |
25 UINT8 *p; | |
26 const UINT8 *pix; | |
27 p = block; | |
28 pix = pixels; | |
29 hh=h>>2; | |
30 dh=h&3; | |
31 while(hh--) { | |
32 __asm __volatile( | |
33 "movq %4, %%mm0\n\t" | |
34 "movq 1%4, %%mm1\n\t" | |
35 "movq %5, %%mm2\n\t" | |
36 "movq 1%5, %%mm3\n\t" | |
37 "movq %6, %%mm4\n\t" | |
38 "movq 1%6, %%mm5\n\t" | |
39 "movq %7, %%mm6\n\t" | |
40 "movq 1%7, %%mm7\n\t" | |
41 PAVGB" %%mm1, %%mm0\n\t" | |
42 PAVGB" %%mm3, %%mm2\n\t" | |
43 PAVGB" %%mm5, %%mm4\n\t" | |
44 PAVGB" %%mm7, %%mm6\n\t" | |
45 "movq %%mm0, %0\n\t" | |
46 "movq %%mm2, %1\n\t" | |
47 "movq %%mm4, %2\n\t" | |
48 "movq %%mm6, %3\n\t" | |
49 :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3)) | |
50 :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3)) | |
51 :"memory"); | |
52 pix += line_size*4; p += line_size*4; | |
53 } | |
54 while(dh--) { | |
55 __asm __volatile( | |
56 "movq %1, %%mm0\n\t" | |
57 "movq 1%1, %%mm1\n\t" | |
58 PAVGB" %%mm1, %%mm0\n\t" | |
59 "movq %%mm0, %0\n\t" | |
60 :"=m"(*p) | |
61 :"m"(*pix) | |
62 :"memory"); | |
63 pix += line_size; p += line_size; | |
64 } | |
65 emms(); | |
66 } | |
67 | |
68 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
69 { | |
70 int dh, hh; | |
71 UINT8 *p; | |
72 const UINT8 *pix; | |
73 p = block; | |
74 pix = pixels; | |
75 | |
76 hh=h>>1; | |
77 dh=h&1; | |
78 while(hh--) { | |
79 __asm __volatile( | |
80 "movq %2, %%mm0\n\t" | |
81 "movq %3, %%mm1\n\t" | |
82 "movq %4, %%mm2\n\t" | |
83 PAVGB" %%mm1, %%mm0\n\t" | |
84 PAVGB" %%mm2, %%mm1\n\t" | |
85 "movq %%mm0, %0\n\t" | |
86 "movq %%mm1, %1\n\t" | |
87 :"=m"(*p), "=m"(*(p+line_size)) | |
88 :"m"(*pix), "m"(*(pix+line_size)), | |
89 "m"(*(pix+line_size*2)) | |
90 :"memory"); | |
91 pix += line_size*2; | |
92 p += line_size*2; | |
93 } | |
94 if(dh) { | |
95 __asm __volatile( | |
96 "movq %1, %%mm0\n\t" | |
97 "movq %2, %%mm1\n\t" | |
98 PAVGB" %%mm1, %%mm0\n\t" | |
99 "movq %%mm0, %0\n\t" | |
100 :"=m"(*p) | |
101 :"m"(*pix), | |
102 "m"(*(pix+line_size)) | |
103 :"memory"); | |
104 } | |
105 emms(); | |
106 } | |
107 | |
108 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
109 { | |
110 int dh, hh; | |
111 UINT8 *p; | |
112 const UINT8 *pix; | |
113 p = block; | |
114 pix = pixels; | |
115 hh=h>>2; | |
116 dh=h&3; | |
117 while(hh--) { | |
118 __asm __volatile( | |
119 "movq %0, %%mm0\n\t" | |
120 "movq %4, %%mm1\n\t" | |
121 "movq %1, %%mm2\n\t" | |
122 "movq %5, %%mm3\n\t" | |
123 "movq %2, %%mm4\n\t" | |
124 "movq %6, %%mm5\n\t" | |
125 "movq %3, %%mm6\n\t" | |
126 "movq %7, %%mm7\n\t" | |
127 PAVGB" %%mm1, %%mm0\n\t" | |
128 PAVGB" %%mm3, %%mm2\n\t" | |
129 PAVGB" %%mm5, %%mm4\n\t" | |
130 PAVGB" %%mm7, %%mm6\n\t" | |
131 "movq %%mm0, %0\n\t" | |
132 "movq %%mm2, %1\n\t" | |
133 "movq %%mm4, %2\n\t" | |
134 "movq %%mm6, %3\n\t" | |
135 :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3)) | |
136 :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3)) | |
137 :"memory"); | |
138 pix += line_size*4; p += line_size*4; | |
139 } | |
140 while(dh--) { | |
141 __asm __volatile( | |
142 "movq %0, %%mm0\n\t" | |
143 "movq %1, %%mm1\n\t" | |
144 PAVGB" %%mm1, %%mm0\n\t" | |
145 "movq %%mm0, %0\n\t" | |
146 :"=m"(*p) | |
147 :"m"(*pix) | |
148 :"memory"); | |
149 pix += line_size; p += line_size; | |
150 } | |
151 emms(); | |
152 } | |
153 | |
154 static void DEF(avg_pixels_x2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
155 { | |
156 int dh, hh; | |
157 UINT8 *p; | |
158 const UINT8 *pix; | |
159 p = block; | |
160 pix = pixels; | |
161 hh=h>>1; | |
162 dh=h&1; | |
163 while(hh--) { | |
164 __asm __volatile( | |
165 "movq %2, %%mm2\n\t" | |
166 "movq 1%2, %%mm3\n\t" | |
167 "movq %3, %%mm4\n\t" | |
168 "movq 1%3, %%mm5\n\t" | |
169 "movq %0, %%mm0\n\t" | |
170 "movq %1, %%mm1\n\t" | |
171 PAVGB" %%mm3, %%mm2\n\t" | |
172 PAVGB" %%mm2, %%mm0\n\t" | |
173 PAVGB" %%mm5, %%mm4\n\t" | |
174 PAVGB" %%mm4, %%mm1\n\t" | |
175 "movq %%mm0, %0\n\t" | |
176 "movq %%mm1, %1\n\t" | |
177 :"=m"(*p), "=m"(*(p+line_size)) | |
178 :"m"(*pix), "m"(*(pix+line_size)) | |
179 :"memory"); | |
180 pix += line_size*2; | |
181 p += line_size*2; | |
182 } | |
183 if(dh) { | |
184 __asm __volatile( | |
185 "movq %1, %%mm1\n\t" | |
186 "movq 1%1, %%mm2\n\t" | |
187 "movq %0, %%mm0\n\t" | |
188 PAVGB" %%mm2, %%mm1\n\t" | |
189 PAVGB" %%mm1, %%mm0\n\t" | |
190 "movq %%mm0, %0\n\t" | |
191 :"=m"(*p) | |
192 :"m"(*pix) | |
193 :"memory"); | |
194 } | |
195 emms(); | |
196 } | |
197 | |
198 static void DEF(avg_pixels_y2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
199 { | |
200 int dh, hh; | |
201 UINT8 *p; | |
202 const UINT8 *pix; | |
203 p = block; | |
204 pix = pixels; | |
205 hh=h>>1; | |
206 dh=h&1; | |
207 while(hh--) { | |
208 __asm __volatile( | |
209 "movq %2, %%mm2\n\t" | |
210 "movq %3, %%mm3\n\t" | |
211 "movq %3, %%mm4\n\t" | |
212 "movq %4, %%mm5\n\t" | |
213 "movq %0, %%mm0\n\t" | |
214 "movq %1, %%mm1\n\t" | |
215 PAVGB" %%mm3, %%mm2\n\t" | |
216 PAVGB" %%mm2, %%mm0\n\t" | |
217 PAVGB" %%mm5, %%mm4\n\t" | |
218 PAVGB" %%mm4, %%mm1\n\t" | |
219 "movq %%mm0, %0\n\t" | |
220 "movq %%mm1, %1\n\t" | |
221 :"=m"(*p), "=m"(*(p+line_size)) | |
222 :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)) | |
223 :"memory"); | |
224 pix += line_size*2; | |
225 p += line_size*2; | |
226 } | |
227 if(dh) { | |
228 __asm __volatile( | |
229 "movq %1, %%mm1\n\t" | |
230 "movq %2, %%mm2\n\t" | |
231 "movq %0, %%mm0\n\t" | |
232 PAVGB" %%mm2, %%mm1\n\t" | |
233 PAVGB" %%mm1, %%mm0\n\t" | |
234 "movq %%mm0, %0\n\t" | |
235 :"=m"(*p) | |
236 :"m"(*pix), "m"(*(pix+line_size)) | |
237 :"memory"); | |
238 } | |
239 emms(); | |
240 } | |
241 | |
242 static void DEF(avg_pixels_xy2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
243 { | |
244 UINT8 *p; | |
245 const UINT8 *pix; | |
246 p = block; | |
247 pix = pixels; | |
248 __asm __volatile( | |
249 "pxor %%mm7, %%mm7\n\t" | |
250 "movq %0, %%mm6\n\t" | |
251 ::"m"(mm_wtwo[0]):"memory"); | |
252 do { | |
253 __asm __volatile( | |
254 "movq %1, %%mm0\n\t" | |
255 "movq %2, %%mm1\n\t" | |
256 "movq 1%1, %%mm4\n\t" | |
257 "movq 1%2, %%mm5\n\t" | |
258 "movq %%mm0, %%mm2\n\t" | |
259 "movq %%mm1, %%mm3\n\t" | |
260 "punpcklbw %%mm7, %%mm0\n\t" | |
261 "punpcklbw %%mm7, %%mm1\n\t" | |
262 "punpckhbw %%mm7, %%mm2\n\t" | |
263 "punpckhbw %%mm7, %%mm3\n\t" | |
264 "paddusw %%mm1, %%mm0\n\t" | |
265 "paddusw %%mm3, %%mm2\n\t" | |
266 "movq %%mm4, %%mm1\n\t" | |
267 "movq %%mm5, %%mm3\n\t" | |
268 "punpcklbw %%mm7, %%mm4\n\t" | |
269 "punpcklbw %%mm7, %%mm5\n\t" | |
270 "punpckhbw %%mm7, %%mm1\n\t" | |
271 "punpckhbw %%mm7, %%mm3\n\t" | |
272 "paddusw %%mm5, %%mm4\n\t" | |
273 "paddusw %%mm3, %%mm1\n\t" | |
274 "paddusw %%mm6, %%mm4\n\t" | |
275 "paddusw %%mm6, %%mm1\n\t" | |
276 "paddusw %%mm4, %%mm0\n\t" | |
277 "paddusw %%mm1, %%mm2\n\t" | |
278 "psrlw $2, %%mm0\n\t" | |
279 "psrlw $2, %%mm2\n\t" | |
280 "packuswb %%mm2, %%mm0\n\t" | |
281 PAVGB" %0, %%mm0\n\t" | |
282 "movq %%mm0, %0\n\t" | |
283 :"=m"(*p) | |
284 :"m"(*pix), | |
285 "m"(*(pix+line_size)) | |
286 :"memory"); | |
287 pix += line_size; | |
288 p += line_size ; | |
289 } while(--h); | |
290 emms(); | |
291 } | |
292 | |
293 static void DEF(sub_pixels_x2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
294 { | |
295 DCTELEM *p; | |
296 const UINT8 *pix; | |
297 p = block; | |
298 pix = pixels; | |
299 __asm __volatile( | |
300 "pxor %%mm7, %%mm7":::"memory"); | |
301 do { | |
302 __asm __volatile( | |
303 "movq 1%1, %%mm2\n\t" | |
304 "movq %0, %%mm0\n\t" | |
305 PAVGB" %1, %%mm2\n\t" | |
306 "movq 8%0, %%mm1\n\t" | |
307 "movq %%mm2, %%mm3\n\t" | |
308 "punpcklbw %%mm7, %%mm2\n\t" | |
309 "punpckhbw %%mm7, %%mm3\n\t" | |
310 "psubsw %%mm2, %%mm0\n\t" | |
311 "psubsw %%mm3, %%mm1\n\t" | |
312 "movq %%mm0, %0\n\t" | |
313 "movq %%mm1, 8%0\n\t" | |
314 :"=m"(*p) | |
315 :"m"(*pix) | |
316 :"memory"); | |
317 pix += line_size; | |
318 p += 8; | |
319 } while (--h); | |
320 emms(); | |
321 } | |
322 | |
323 static void DEF(sub_pixels_y2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
324 { | |
325 DCTELEM *p; | |
326 const UINT8 *pix; | |
327 p = block; | |
328 pix = pixels; | |
329 __asm __volatile( | |
330 "pxor %%mm7, %%mm7":::"memory"); | |
331 do { | |
332 __asm __volatile( | |
333 "movq %2, %%mm2\n\t" | |
334 "movq %0, %%mm0\n\t" | |
335 PAVGB" %1, %%mm2\n\t" | |
336 "movq 8%0, %%mm1\n\t" | |
337 "movq %%mm2, %%mm3\n\t" | |
338 "punpcklbw %%mm7, %%mm2\n\t" | |
339 "punpckhbw %%mm7, %%mm3\n\t" | |
340 "psubsw %%mm2, %%mm0\n\t" | |
341 "psubsw %%mm3, %%mm1\n\t" | |
342 "movq %%mm0, %0\n\t" | |
343 "movq %%mm1, 8%0\n\t" | |
344 :"=m"(*p) | |
345 :"m"(*pix), "m"(*(pix+line_size)) | |
346 :"memory"); | |
347 pix += line_size; | |
348 p += 8; | |
349 } while (--h); | |
350 emms(); | |
351 } | |
352 |