0
|
1 /*
|
|
2 * MMX optimized DSP utils
|
|
3 * Copyright (c) 2000, 2001 Gerard Lantau.
|
|
4 *
|
|
5 * This program is free software; you can redistribute it and/or modify
|
|
6 * it under the terms of the GNU General Public License as published by
|
|
7 * the Free Software Foundation; either version 2 of the License, or
|
|
8 * (at your option) any later version.
|
|
9 *
|
|
10 * This program is distributed in the hope that it will be useful,
|
|
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13 * GNU General Public License for more details.
|
|
14 *
|
|
15 * You should have received a copy of the GNU General Public License
|
|
16 * along with this program; if not, write to the Free Software
|
|
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
18 *
|
|
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
|
|
20 */
|
|
21
|
|
22 #include "../dsputil.h"
|
|
23
|
5
|
24 int mm_flags; /* multimedia extension flags */
|
|
25
|
0
|
26 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
|
|
27 int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h);
|
|
28 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
|
|
29 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
|
|
30 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
|
|
31
|
|
32 /* pixel operations */
|
8
|
33 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001;
|
|
34 static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002;
|
|
35 //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
|
|
36 //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };
|
0
|
37
|
|
38 /***********************************/
|
|
39 /* 3Dnow specific */
|
|
40
|
|
41 #define DEF(x) x ## _3dnow
|
|
42 /* for Athlons PAVGUSB is preferred */
|
|
43 #define PAVGB "pavgusb"
|
|
44
|
|
45 #include "dsputil_mmx_avg.h"
|
|
46
|
|
47 #undef DEF
|
|
48 #undef PAVGB
|
|
49
|
|
50 /***********************************/
|
|
51 /* MMX2 specific */
|
|
52
|
|
53 #define DEF(x) x ## _sse
|
|
54
|
|
55 /* Introduced only in MMX2 set */
|
|
56 #define PAVGB "pavgb"
|
|
57
|
|
58 #include "dsputil_mmx_avg.h"
|
|
59
|
|
60 #undef DEF
|
|
61 #undef PAVGB
|
|
62
|
|
63 /***********************************/
|
|
64 /* standard MMX */
|
|
65
|
|
66 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
|
|
67 {
|
|
68 DCTELEM *p;
|
|
69 const UINT8 *pix;
|
|
70 int i;
|
|
71
|
|
72 /* read the pixels */
|
|
73 p = block;
|
|
74 pix = pixels;
|
|
75 __asm __volatile("pxor %%mm7, %%mm7":::"memory");
|
|
76 for(i=0;i<4;i++) {
|
|
77 __asm __volatile(
|
|
78 "movq %1, %%mm0\n\t"
|
|
79 "movq %2, %%mm1\n\t"
|
|
80 "movq %%mm0, %%mm2\n\t"
|
|
81 "movq %%mm1, %%mm3\n\t"
|
|
82 "punpcklbw %%mm7, %%mm0\n\t"
|
|
83 "punpckhbw %%mm7, %%mm2\n\t"
|
|
84 "punpcklbw %%mm7, %%mm1\n\t"
|
|
85 "punpckhbw %%mm7, %%mm3\n\t"
|
|
86 "movq %%mm0, %0\n\t"
|
|
87 "movq %%mm2, 8%0\n\t"
|
|
88 "movq %%mm1, 16%0\n\t"
|
|
89 "movq %%mm3, 24%0\n\t"
|
|
90 :"=m"(*p)
|
|
91 :"m"(*pix), "m"(*(pix+line_size))
|
|
92 :"memory");
|
|
93 pix += line_size*2;
|
|
94 p += 16;
|
|
95 }
|
|
96 }
|
|
97
|
|
98 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
|
|
99 {
|
|
100 const DCTELEM *p;
|
|
101 UINT8 *pix;
|
|
102 int i;
|
|
103
|
|
104 /* read the pixels */
|
|
105 p = block;
|
|
106 pix = pixels;
|
|
107 for(i=0;i<2;i++) {
|
|
108 __asm __volatile(
|
|
109 "movq %4, %%mm0\n\t"
|
|
110 "movq 8%4, %%mm1\n\t"
|
|
111 "movq 16%4, %%mm2\n\t"
|
|
112 "movq 24%4, %%mm3\n\t"
|
|
113 "movq 32%4, %%mm4\n\t"
|
|
114 "movq 40%4, %%mm5\n\t"
|
|
115 "movq 48%4, %%mm6\n\t"
|
|
116 "movq 56%4, %%mm7\n\t"
|
|
117 "packuswb %%mm1, %%mm0\n\t"
|
|
118 "packuswb %%mm3, %%mm2\n\t"
|
|
119 "packuswb %%mm5, %%mm4\n\t"
|
|
120 "packuswb %%mm7, %%mm6\n\t"
|
|
121 "movq %%mm0, %0\n\t"
|
|
122 "movq %%mm2, %1\n\t"
|
|
123 "movq %%mm4, %2\n\t"
|
|
124 "movq %%mm6, %3\n\t"
|
|
125 :"=m"(*pix), "=m"(*(pix+line_size))
|
|
126 ,"=m"(*(pix+line_size*2)), "=m"(*(pix+line_size*3))
|
|
127 :"m"(*p)
|
|
128 :"memory");
|
|
129 pix += line_size*4;
|
|
130 p += 32;
|
|
131 }
|
|
132 }
|
|
133
|
|
134 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
|
|
135 {
|
|
136 const DCTELEM *p;
|
|
137 UINT8 *pix;
|
|
138 int i;
|
|
139
|
|
140 /* read the pixels */
|
|
141 p = block;
|
|
142 pix = pixels;
|
|
143 __asm __volatile("pxor %%mm7, %%mm7":::"memory");
|
|
144 for(i=0;i<4;i++) {
|
|
145 __asm __volatile(
|
|
146 "movq %2, %%mm0\n\t"
|
|
147 "movq 8%2, %%mm1\n\t"
|
|
148 "movq 16%2, %%mm2\n\t"
|
|
149 "movq 24%2, %%mm3\n\t"
|
|
150 "movq %0, %%mm4\n\t"
|
|
151 "movq %1, %%mm6\n\t"
|
|
152 "movq %%mm4, %%mm5\n\t"
|
|
153 "punpcklbw %%mm7, %%mm4\n\t"
|
|
154 "punpckhbw %%mm7, %%mm5\n\t"
|
|
155 "paddsw %%mm4, %%mm0\n\t"
|
|
156 "paddsw %%mm5, %%mm1\n\t"
|
|
157 "movq %%mm6, %%mm5\n\t"
|
|
158 "punpcklbw %%mm7, %%mm6\n\t"
|
|
159 "punpckhbw %%mm7, %%mm5\n\t"
|
|
160 "paddsw %%mm6, %%mm2\n\t"
|
|
161 "paddsw %%mm5, %%mm3\n\t"
|
|
162 "packuswb %%mm1, %%mm0\n\t"
|
|
163 "packuswb %%mm3, %%mm2\n\t"
|
|
164 "movq %%mm0, %0\n\t"
|
|
165 "movq %%mm2, %1\n\t"
|
|
166 :"=m"(*pix), "=m"(*(pix+line_size))
|
|
167 :"m"(*p)
|
|
168 :"memory");
|
|
169 pix += line_size*2;
|
|
170 p += 16;
|
|
171 }
|
|
172 }
|
|
173
|
|
174 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
175 {
|
|
176 int dh, hh;
|
|
177 UINT8 *p;
|
|
178 const UINT8 *pix;
|
|
179 p = block;
|
|
180 pix = pixels;
|
|
181 hh=h>>2;
|
|
182 dh=h&3;
|
|
183 while(hh--) {
|
|
184 __asm __volatile(
|
|
185 "movq %4, %%mm0\n\t"
|
|
186 "movq %5, %%mm1\n\t"
|
|
187 "movq %6, %%mm2\n\t"
|
|
188 "movq %7, %%mm3\n\t"
|
|
189 "movq %%mm0, %0\n\t"
|
|
190 "movq %%mm1, %1\n\t"
|
|
191 "movq %%mm2, %2\n\t"
|
|
192 "movq %%mm3, %3\n\t"
|
|
193 :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3))
|
|
194 :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3))
|
|
195 :"memory");
|
|
196 pix = pix + line_size*4;
|
|
197 p = p + line_size*4;
|
|
198 }
|
|
199 while(dh--) {
|
|
200 __asm __volatile(
|
|
201 "movq %1, %%mm0\n\t"
|
|
202 "movq %%mm0, %0\n\t"
|
|
203 :"=m"(*p)
|
|
204 :"m"(*pix)
|
|
205 :"memory");
|
|
206 pix = pix + line_size;
|
|
207 p = p + line_size;
|
|
208 }
|
|
209 }
|
|
210
|
|
211 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
212 {
|
|
213 UINT8 *p;
|
|
214 const UINT8 *pix;
|
|
215 p = block;
|
|
216 pix = pixels;
|
|
217 __asm __volatile(
|
|
218 "pxor %%mm7, %%mm7\n\t"
|
|
219 "movq %0, %%mm4\n\t"
|
8
|
220 ::"m"(mm_wone):"memory");
|
0
|
221 do {
|
|
222 __asm __volatile(
|
|
223 "movq %1, %%mm0\n\t"
|
|
224 "movq 1%1, %%mm1\n\t"
|
|
225 "movq %%mm0, %%mm2\n\t"
|
|
226 "movq %%mm1, %%mm3\n\t"
|
|
227 "punpcklbw %%mm7, %%mm0\n\t"
|
|
228 "punpcklbw %%mm7, %%mm1\n\t"
|
|
229 "punpckhbw %%mm7, %%mm2\n\t"
|
|
230 "punpckhbw %%mm7, %%mm3\n\t"
|
|
231 "paddusw %%mm1, %%mm0\n\t"
|
|
232 "paddusw %%mm3, %%mm2\n\t"
|
|
233 "paddusw %%mm4, %%mm0\n\t"
|
|
234 "paddusw %%mm4, %%mm2\n\t"
|
|
235 "psrlw $1, %%mm0\n\t"
|
|
236 "psrlw $1, %%mm2\n\t"
|
|
237 "packuswb %%mm2, %%mm0\n\t"
|
|
238 "movq %%mm0, %0\n\t"
|
|
239 :"=m"(*p)
|
|
240 :"m"(*pix)
|
|
241 :"memory");
|
|
242 pix += line_size; p += line_size;
|
|
243 } while (--h);
|
|
244 }
|
|
245
|
|
246 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
247 {
|
|
248 UINT8 *p;
|
|
249 const UINT8 *pix;
|
|
250 p = block;
|
|
251 pix = pixels;
|
|
252 __asm __volatile(
|
|
253 "pxor %%mm7, %%mm7\n\t"
|
|
254 "movq %0, %%mm4\n\t"
|
8
|
255 ::"m"(mm_wone):"memory");
|
0
|
256 do {
|
|
257 __asm __volatile(
|
|
258 "movq %1, %%mm0\n\t"
|
|
259 "movq %2, %%mm1\n\t"
|
|
260 "movq %%mm0, %%mm2\n\t"
|
|
261 "movq %%mm1, %%mm3\n\t"
|
|
262 "punpcklbw %%mm7, %%mm0\n\t"
|
|
263 "punpcklbw %%mm7, %%mm1\n\t"
|
|
264 "punpckhbw %%mm7, %%mm2\n\t"
|
|
265 "punpckhbw %%mm7, %%mm3\n\t"
|
|
266 "paddusw %%mm1, %%mm0\n\t"
|
|
267 "paddusw %%mm3, %%mm2\n\t"
|
|
268 "paddusw %%mm4, %%mm0\n\t"
|
|
269 "paddusw %%mm4, %%mm2\n\t"
|
|
270 "psrlw $1, %%mm0\n\t"
|
|
271 "psrlw $1, %%mm2\n\t"
|
|
272 "packuswb %%mm2, %%mm0\n\t"
|
|
273 "movq %%mm0, %0\n\t"
|
|
274 :"=m"(*p)
|
|
275 :"m"(*pix),
|
|
276 "m"(*(pix+line_size))
|
|
277 :"memory");
|
|
278 pix += line_size;
|
|
279 p += line_size;
|
|
280 } while (--h);
|
|
281 }
|
|
282
|
|
283 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
284 {
|
|
285 UINT8 *p;
|
|
286 const UINT8 *pix;
|
|
287 p = block;
|
|
288 pix = pixels;
|
|
289 __asm __volatile(
|
|
290 "pxor %%mm7, %%mm7\n\t"
|
|
291 "movq %0, %%mm6\n\t"
|
8
|
292 ::"m"(mm_wtwo):"memory");
|
0
|
293 do {
|
|
294 __asm __volatile(
|
|
295 "movq %1, %%mm0\n\t"
|
|
296 "movq %2, %%mm1\n\t"
|
|
297 "movq 1%1, %%mm4\n\t"
|
|
298 "movq 1%2, %%mm5\n\t"
|
|
299 "movq %%mm0, %%mm2\n\t"
|
|
300 "movq %%mm1, %%mm3\n\t"
|
|
301 "punpcklbw %%mm7, %%mm0\n\t"
|
|
302 "punpcklbw %%mm7, %%mm1\n\t"
|
|
303 "punpckhbw %%mm7, %%mm2\n\t"
|
|
304 "punpckhbw %%mm7, %%mm3\n\t"
|
|
305 "paddusw %%mm1, %%mm0\n\t"
|
|
306 "paddusw %%mm3, %%mm2\n\t"
|
|
307 "movq %%mm4, %%mm1\n\t"
|
|
308 "movq %%mm5, %%mm3\n\t"
|
|
309 "punpcklbw %%mm7, %%mm4\n\t"
|
|
310 "punpcklbw %%mm7, %%mm5\n\t"
|
|
311 "punpckhbw %%mm7, %%mm1\n\t"
|
|
312 "punpckhbw %%mm7, %%mm3\n\t"
|
|
313 "paddusw %%mm5, %%mm4\n\t"
|
|
314 "paddusw %%mm3, %%mm1\n\t"
|
|
315 "paddusw %%mm6, %%mm4\n\t"
|
|
316 "paddusw %%mm6, %%mm1\n\t"
|
|
317 "paddusw %%mm4, %%mm0\n\t"
|
|
318 "paddusw %%mm1, %%mm2\n\t"
|
|
319 "psrlw $2, %%mm0\n\t"
|
|
320 "psrlw $2, %%mm2\n\t"
|
|
321 "packuswb %%mm2, %%mm0\n\t"
|
|
322 "movq %%mm0, %0\n\t"
|
|
323 :"=m"(*p)
|
|
324 :"m"(*pix),
|
|
325 "m"(*(pix+line_size))
|
|
326 :"memory");
|
|
327 pix += line_size;
|
|
328 p += line_size;
|
|
329 } while(--h);
|
|
330 }
|
|
331
|
|
332 static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
333 {
|
|
334 UINT8 *p;
|
|
335 const UINT8 *pix;
|
|
336 p = block;
|
|
337 pix = pixels;
|
|
338 __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory");
|
|
339 do {
|
|
340 __asm __volatile(
|
|
341 "movq %1, %%mm0\n\t"
|
|
342 "movq 1%1, %%mm1\n\t"
|
|
343 "movq %%mm0, %%mm2\n\t"
|
|
344 "movq %%mm1, %%mm3\n\t"
|
|
345 "punpcklbw %%mm7, %%mm0\n\t"
|
|
346 "punpcklbw %%mm7, %%mm1\n\t"
|
|
347 "punpckhbw %%mm7, %%mm2\n\t"
|
|
348 "punpckhbw %%mm7, %%mm3\n\t"
|
|
349 "paddusw %%mm1, %%mm0\n\t"
|
|
350 "paddusw %%mm3, %%mm2\n\t"
|
|
351 "psrlw $1, %%mm0\n\t"
|
|
352 "psrlw $1, %%mm2\n\t"
|
|
353 "packuswb %%mm2, %%mm0\n\t"
|
|
354 "movq %%mm0, %0\n\t"
|
|
355 :"=m"(*p)
|
|
356 :"m"(*pix)
|
|
357 :"memory");
|
|
358 pix += line_size;
|
|
359 p += line_size;
|
|
360 } while (--h);
|
|
361 }
|
|
362
|
|
363 static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
364 {
|
|
365 UINT8 *p;
|
|
366 const UINT8 *pix;
|
|
367 p = block;
|
|
368 pix = pixels;
|
|
369 __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory");
|
|
370 do {
|
|
371 __asm __volatile(
|
|
372 "movq %1, %%mm0\n\t"
|
|
373 "movq %2, %%mm1\n\t"
|
|
374 "movq %%mm0, %%mm2\n\t"
|
|
375 "movq %%mm1, %%mm3\n\t"
|
|
376 "punpcklbw %%mm7, %%mm0\n\t"
|
|
377 "punpcklbw %%mm7, %%mm1\n\t"
|
|
378 "punpckhbw %%mm7, %%mm2\n\t"
|
|
379 "punpckhbw %%mm7, %%mm3\n\t"
|
|
380 "paddusw %%mm1, %%mm0\n\t"
|
|
381 "paddusw %%mm3, %%mm2\n\t"
|
|
382 "psrlw $1, %%mm0\n\t"
|
|
383 "psrlw $1, %%mm2\n\t"
|
|
384 "packuswb %%mm2, %%mm0\n\t"
|
|
385 "movq %%mm0, %0\n\t"
|
|
386 :"=m"(*p)
|
|
387 :"m"(*pix),
|
|
388 "m"(*(pix+line_size))
|
|
389 :"memory");
|
|
390 pix += line_size;
|
|
391 p += line_size;
|
|
392 } while(--h);
|
|
393 }
|
|
394
|
|
395 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
396 {
|
|
397 UINT8 *p;
|
|
398 const UINT8 *pix;
|
|
399 p = block;
|
|
400 pix = pixels;
|
|
401 __asm __volatile(
|
|
402 "pxor %%mm7, %%mm7\n\t"
|
|
403 "movq %0, %%mm6\n\t"
|
8
|
404 ::"m"(mm_wone):"memory");
|
0
|
405 do {
|
|
406 __asm __volatile(
|
|
407 "movq %1, %%mm0\n\t"
|
|
408 "movq %2, %%mm1\n\t"
|
|
409 "movq 1%1, %%mm4\n\t"
|
|
410 "movq 1%2, %%mm5\n\t"
|
|
411 "movq %%mm0, %%mm2\n\t"
|
|
412 "movq %%mm1, %%mm3\n\t"
|
|
413 "punpcklbw %%mm7, %%mm0\n\t"
|
|
414 "punpcklbw %%mm7, %%mm1\n\t"
|
|
415 "punpckhbw %%mm7, %%mm2\n\t"
|
|
416 "punpckhbw %%mm7, %%mm3\n\t"
|
|
417 "paddusw %%mm1, %%mm0\n\t"
|
|
418 "paddusw %%mm3, %%mm2\n\t"
|
|
419 "movq %%mm4, %%mm1\n\t"
|
|
420 "movq %%mm5, %%mm3\n\t"
|
|
421 "punpcklbw %%mm7, %%mm4\n\t"
|
|
422 "punpcklbw %%mm7, %%mm5\n\t"
|
|
423 "punpckhbw %%mm7, %%mm1\n\t"
|
|
424 "punpckhbw %%mm7, %%mm3\n\t"
|
|
425 "paddusw %%mm5, %%mm4\n\t"
|
|
426 "paddusw %%mm3, %%mm1\n\t"
|
|
427 "paddusw %%mm6, %%mm4\n\t"
|
|
428 "paddusw %%mm6, %%mm1\n\t"
|
|
429 "paddusw %%mm4, %%mm0\n\t"
|
|
430 "paddusw %%mm1, %%mm2\n\t"
|
|
431 "psrlw $2, %%mm0\n\t"
|
|
432 "psrlw $2, %%mm2\n\t"
|
|
433 "packuswb %%mm2, %%mm0\n\t"
|
|
434 "movq %%mm0, %0\n\t"
|
|
435 :"=m"(*p)
|
|
436 :"m"(*pix),
|
|
437 "m"(*(pix+line_size))
|
|
438 :"memory");
|
|
439 pix += line_size;
|
|
440 p += line_size;
|
|
441 } while(--h);
|
|
442 }
|
|
443
|
|
444 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
445 {
|
|
446 UINT8 *p;
|
|
447 const UINT8 *pix;
|
|
448 p = block;
|
|
449 pix = pixels;
|
|
450 __asm __volatile(
|
|
451 "pxor %%mm7, %%mm7\n\t"
|
|
452 "movq %0, %%mm6\n\t"
|
8
|
453 ::"m"(mm_wone):"memory");
|
0
|
454 do {
|
|
455 __asm __volatile(
|
|
456 "movq %0, %%mm0\n\t"
|
|
457 "movq %1, %%mm1\n\t"
|
|
458 "movq %%mm0, %%mm2\n\t"
|
|
459 "movq %%mm1, %%mm3\n\t"
|
|
460 "punpcklbw %%mm7, %%mm0\n\t"
|
|
461 "punpcklbw %%mm7, %%mm1\n\t"
|
|
462 "punpckhbw %%mm7, %%mm2\n\t"
|
|
463 "punpckhbw %%mm7, %%mm3\n\t"
|
|
464 "paddusw %%mm1, %%mm0\n\t"
|
|
465 "paddusw %%mm3, %%mm2\n\t"
|
|
466 "paddusw %%mm6, %%mm0\n\t"
|
|
467 "paddusw %%mm6, %%mm2\n\t"
|
|
468 "psrlw $1, %%mm0\n\t"
|
|
469 "psrlw $1, %%mm2\n\t"
|
|
470 "packuswb %%mm2, %%mm0\n\t"
|
|
471 "movq %%mm0, %0\n\t"
|
|
472 :"=m"(*p)
|
|
473 :"m"(*pix)
|
|
474 :"memory");
|
|
475 pix += line_size;
|
|
476 p += line_size;
|
|
477 }
|
|
478 while (--h);
|
|
479 }
|
|
480
|
|
481 static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
482 {
|
|
483 UINT8 *p;
|
|
484 const UINT8 *pix;
|
|
485 p = block;
|
|
486 pix = pixels;
|
|
487 __asm __volatile(
|
|
488 "pxor %%mm7, %%mm7\n\t"
|
|
489 "movq %0, %%mm6\n\t"
|
8
|
490 ::"m"(mm_wone):"memory");
|
0
|
491 do {
|
|
492 __asm __volatile(
|
|
493 "movq %1, %%mm1\n\t"
|
|
494 "movq %0, %%mm0\n\t"
|
|
495 "movq 1%1, %%mm4\n\t"
|
|
496 "movq %%mm0, %%mm2\n\t"
|
|
497 "movq %%mm1, %%mm3\n\t"
|
|
498 "movq %%mm4, %%mm5\n\t"
|
|
499 "punpcklbw %%mm7, %%mm1\n\t"
|
|
500 "punpckhbw %%mm7, %%mm3\n\t"
|
|
501 "punpcklbw %%mm7, %%mm4\n\t"
|
|
502 "punpckhbw %%mm7, %%mm5\n\t"
|
|
503 "punpcklbw %%mm7, %%mm0\n\t"
|
|
504 "punpckhbw %%mm7, %%mm2\n\t"
|
|
505 "paddusw %%mm4, %%mm1\n\t"
|
|
506 "paddusw %%mm5, %%mm3\n\t"
|
|
507 "paddusw %%mm6, %%mm1\n\t"
|
|
508 "paddusw %%mm6, %%mm3\n\t"
|
|
509 "psrlw $1, %%mm1\n\t"
|
|
510 "psrlw $1, %%mm3\n\t"
|
|
511 "paddusw %%mm6, %%mm0\n\t"
|
|
512 "paddusw %%mm6, %%mm2\n\t"
|
|
513 "paddusw %%mm1, %%mm0\n\t"
|
|
514 "paddusw %%mm3, %%mm2\n\t"
|
|
515 "psrlw $1, %%mm0\n\t"
|
|
516 "psrlw $1, %%mm2\n\t"
|
|
517 "packuswb %%mm2, %%mm0\n\t"
|
|
518 "movq %%mm0, %0\n\t"
|
|
519 :"=m"(*p)
|
|
520 :"m"(*pix)
|
|
521 :"memory");
|
|
522 pix += line_size;
|
|
523 p += line_size;
|
|
524 } while (--h);
|
|
525 }
|
|
526
|
|
527 static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
528 {
|
|
529 UINT8 *p;
|
|
530 const UINT8 *pix;
|
|
531 p = block;
|
|
532 pix = pixels;
|
|
533 __asm __volatile(
|
|
534 "pxor %%mm7, %%mm7\n\t"
|
|
535 "movq %0, %%mm6\n\t"
|
8
|
536 ::"m"(mm_wone):"memory");
|
0
|
537 do {
|
|
538 __asm __volatile(
|
|
539 "movq %1, %%mm1\n\t"
|
|
540 "movq %0, %%mm0\n\t"
|
|
541 "movq %2, %%mm4\n\t"
|
|
542 "movq %%mm0, %%mm2\n\t"
|
|
543 "movq %%mm1, %%mm3\n\t"
|
|
544 "movq %%mm4, %%mm5\n\t"
|
|
545 "punpcklbw %%mm7, %%mm1\n\t"
|
|
546 "punpckhbw %%mm7, %%mm3\n\t"
|
|
547 "punpcklbw %%mm7, %%mm4\n\t"
|
|
548 "punpckhbw %%mm7, %%mm5\n\t"
|
|
549 "punpcklbw %%mm7, %%mm0\n\t"
|
|
550 "punpckhbw %%mm7, %%mm2\n\t"
|
|
551 "paddusw %%mm4, %%mm1\n\t"
|
|
552 "paddusw %%mm5, %%mm3\n\t"
|
|
553 "paddusw %%mm6, %%mm1\n\t"
|
|
554 "paddusw %%mm6, %%mm3\n\t"
|
|
555 "psrlw $1, %%mm1\n\t"
|
|
556 "psrlw $1, %%mm3\n\t"
|
|
557 "paddusw %%mm6, %%mm0\n\t"
|
|
558 "paddusw %%mm6, %%mm2\n\t"
|
|
559 "paddusw %%mm1, %%mm0\n\t"
|
|
560 "paddusw %%mm3, %%mm2\n\t"
|
|
561 "psrlw $1, %%mm0\n\t"
|
|
562 "psrlw $1, %%mm2\n\t"
|
|
563 "packuswb %%mm2, %%mm0\n\t"
|
|
564 "movq %%mm0, %0\n\t"
|
|
565 :"=m"(*p)
|
|
566 :"m"(*pix), "m"(*(pix+line_size))
|
|
567 :"memory");
|
|
568 pix += line_size;
|
|
569 p += line_size ;
|
|
570 } while(--h);
|
|
571 }
|
|
572
|
|
573 static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
574 {
|
|
575 UINT8 *p;
|
|
576 const UINT8 *pix;
|
|
577 p = block;
|
|
578 pix = pixels;
|
|
579 __asm __volatile(
|
|
580 "pxor %%mm7, %%mm7\n\t"
|
|
581 "movq %0, %%mm6\n\t"
|
8
|
582 ::"m"(mm_wtwo):"memory");
|
0
|
583 do {
|
|
584 __asm __volatile(
|
|
585 "movq %1, %%mm0\n\t"
|
|
586 "movq %2, %%mm1\n\t"
|
|
587 "movq 1%1, %%mm4\n\t"
|
|
588 "movq 1%2, %%mm5\n\t"
|
|
589 "movq %%mm0, %%mm2\n\t"
|
|
590 "movq %%mm1, %%mm3\n\t"
|
|
591 "punpcklbw %%mm7, %%mm0\n\t"
|
|
592 "punpcklbw %%mm7, %%mm1\n\t"
|
|
593 "punpckhbw %%mm7, %%mm2\n\t"
|
|
594 "punpckhbw %%mm7, %%mm3\n\t"
|
|
595 "paddusw %%mm1, %%mm0\n\t"
|
|
596 "paddusw %%mm3, %%mm2\n\t"
|
|
597 "movq %%mm4, %%mm1\n\t"
|
|
598 "movq %%mm5, %%mm3\n\t"
|
|
599 "punpcklbw %%mm7, %%mm4\n\t"
|
|
600 "punpcklbw %%mm7, %%mm5\n\t"
|
|
601 "punpckhbw %%mm7, %%mm1\n\t"
|
|
602 "punpckhbw %%mm7, %%mm3\n\t"
|
|
603 "paddusw %%mm5, %%mm4\n\t"
|
|
604 "paddusw %%mm3, %%mm1\n\t"
|
|
605 "paddusw %%mm6, %%mm4\n\t"
|
|
606 "paddusw %%mm6, %%mm1\n\t"
|
|
607 "paddusw %%mm4, %%mm0\n\t"
|
|
608 "paddusw %%mm1, %%mm2\n\t"
|
|
609 "movq %3, %%mm5\n\t"
|
|
610 "psrlw $2, %%mm0\n\t"
|
|
611 "movq %0, %%mm1\n\t"
|
|
612 "psrlw $2, %%mm2\n\t"
|
|
613 "movq %%mm1, %%mm3\n\t"
|
|
614 "punpcklbw %%mm7, %%mm1\n\t"
|
|
615 "punpckhbw %%mm7, %%mm3\n\t"
|
|
616 "paddusw %%mm1, %%mm0\n\t"
|
|
617 "paddusw %%mm3, %%mm2\n\t"
|
|
618 "paddusw %%mm5, %%mm0\n\t"
|
|
619 "paddusw %%mm5, %%mm2\n\t"
|
|
620 "psrlw $1, %%mm0\n\t"
|
|
621 "psrlw $1, %%mm2\n\t"
|
|
622 "packuswb %%mm2, %%mm0\n\t"
|
|
623 "movq %%mm0, %0\n\t"
|
|
624 :"=m"(*p)
|
|
625 :"m"(*pix),
|
8
|
626 "m"(*(pix+line_size)), "m"(mm_wone)
|
0
|
627 :"memory");
|
|
628 pix += line_size;
|
|
629 p += line_size ;
|
|
630 } while(--h);
|
|
631 }
|
|
632
|
|
633 static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
634 {
|
|
635 UINT8 *p;
|
|
636 const UINT8 *pix;
|
|
637 p = block;
|
|
638 pix = pixels;
|
|
639 __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory");
|
|
640 do {
|
|
641 __asm __volatile(
|
|
642 "movq %1, %%mm0\n\t"
|
|
643 "movq %0, %%mm1\n\t"
|
|
644 "movq %%mm0, %%mm2\n\t"
|
|
645 "movq %%mm1, %%mm3\n\t"
|
|
646 "punpcklbw %%mm7, %%mm0\n\t"
|
|
647 "punpcklbw %%mm7, %%mm1\n\t"
|
|
648 "punpckhbw %%mm7, %%mm2\n\t"
|
|
649 "punpckhbw %%mm7, %%mm3\n\t"
|
|
650 "paddusw %%mm1, %%mm0\n\t"
|
|
651 "paddusw %%mm3, %%mm2\n\t"
|
|
652 "psrlw $1, %%mm0\n\t"
|
|
653 "psrlw $1, %%mm2\n\t"
|
|
654 "packuswb %%mm2, %%mm0\n\t"
|
|
655 "movq %%mm0, %0\n\t"
|
|
656 :"=m"(*p)
|
|
657 :"m"(*pix)
|
|
658 :"memory");
|
|
659 pix += line_size;
|
|
660 p += line_size ;
|
|
661 } while (--h);
|
|
662 }
|
|
663
|
|
664 static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
665 {
|
|
666 UINT8 *p;
|
|
667 const UINT8 *pix;
|
|
668 p = block;
|
|
669 pix = pixels;
|
|
670 __asm __volatile(
|
|
671 "pxor %%mm7, %%mm7\n\t":::"memory");
|
|
672 do {
|
|
673 __asm __volatile(
|
|
674 "movq %1, %%mm0\n\t"
|
|
675 "movq 1%1, %%mm1\n\t"
|
|
676 "movq %0, %%mm4\n\t"
|
|
677 "movq %%mm0, %%mm2\n\t"
|
|
678 "movq %%mm1, %%mm3\n\t"
|
|
679 "movq %%mm4, %%mm5\n\t"
|
|
680 "punpcklbw %%mm7, %%mm0\n\t"
|
|
681 "punpcklbw %%mm7, %%mm1\n\t"
|
|
682 "punpckhbw %%mm7, %%mm2\n\t"
|
|
683 "punpckhbw %%mm7, %%mm3\n\t"
|
|
684 "punpcklbw %%mm7, %%mm4\n\t"
|
|
685 "punpckhbw %%mm7, %%mm5\n\t"
|
|
686 "paddusw %%mm1, %%mm0\n\t"
|
|
687 "paddusw %%mm3, %%mm2\n\t"
|
|
688 "psrlw $1, %%mm0\n\t"
|
|
689 "psrlw $1, %%mm2\n\t"
|
|
690 "paddusw %%mm4, %%mm0\n\t"
|
|
691 "paddusw %%mm5, %%mm2\n\t"
|
|
692 "psrlw $1, %%mm0\n\t"
|
|
693 "psrlw $1, %%mm2\n\t"
|
|
694 "packuswb %%mm2, %%mm0\n\t"
|
|
695 "movq %%mm0, %0\n\t"
|
|
696 :"=m"(*p)
|
|
697 :"m"(*pix)
|
|
698 :"memory");
|
|
699 pix += line_size;
|
|
700 p += line_size;
|
|
701 } while (--h);
|
|
702 }
|
|
703
|
|
704 static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
705 {
|
|
706 UINT8 *p;
|
|
707 const UINT8 *pix;
|
|
708 p = block;
|
|
709 pix = pixels;
|
|
710 __asm __volatile(
|
|
711 "pxor %%mm7, %%mm7\n\t":::"memory");
|
|
712 do {
|
|
713 __asm __volatile(
|
|
714 "movq %1, %%mm0\n\t"
|
|
715 "movq %2, %%mm1\n\t"
|
|
716 "movq %0, %%mm4\n\t"
|
|
717 "movq %%mm0, %%mm2\n\t"
|
|
718 "movq %%mm1, %%mm3\n\t"
|
|
719 "movq %%mm4, %%mm5\n\t"
|
|
720 "punpcklbw %%mm7, %%mm0\n\t"
|
|
721 "punpcklbw %%mm7, %%mm1\n\t"
|
|
722 "punpckhbw %%mm7, %%mm2\n\t"
|
|
723 "punpckhbw %%mm7, %%mm3\n\t"
|
|
724 "punpcklbw %%mm7, %%mm4\n\t"
|
|
725 "punpckhbw %%mm7, %%mm5\n\t"
|
|
726 "paddusw %%mm1, %%mm0\n\t"
|
|
727 "paddusw %%mm3, %%mm2\n\t"
|
|
728 "psrlw $1, %%mm0\n\t"
|
|
729 "psrlw $1, %%mm2\n\t"
|
|
730 "paddusw %%mm4, %%mm0\n\t"
|
|
731 "paddusw %%mm5, %%mm2\n\t"
|
|
732 "psrlw $1, %%mm0\n\t"
|
|
733 "psrlw $1, %%mm2\n\t"
|
|
734 "packuswb %%mm2, %%mm0\n\t"
|
|
735 "movq %%mm0, %0\n\t"
|
|
736 :"=m"(*p)
|
|
737 :"m"(*pix), "m"(*(pix+line_size))
|
|
738 :"memory");
|
|
739 pix += line_size;
|
|
740 p += line_size ;
|
|
741 } while(--h);
|
|
742 }
|
|
743
|
|
744 static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
|
|
745 {
|
|
746 UINT8 *p;
|
|
747 const UINT8 *pix;
|
|
748 p = block;
|
|
749 pix = pixels;
|
|
750 __asm __volatile(
|
|
751 "pxor %%mm7, %%mm7\n\t"
|
|
752 "movq %0, %%mm6\n\t"
|
8
|
753 ::"m"(mm_wone):"memory");
|
0
|
754 do {
|
|
755 __asm __volatile(
|
|
756 "movq %1, %%mm0\n\t"
|
|
757 "movq %2, %%mm1\n\t"
|
|
758 "movq 1%1, %%mm4\n\t"
|
|
759 "movq 1%2, %%mm5\n\t"
|
|
760 "movq %%mm0, %%mm2\n\t"
|
|
761 "movq %%mm1, %%mm3\n\t"
|
|
762 "punpcklbw %%mm7, %%mm0\n\t"
|
|
763 "punpcklbw %%mm7, %%mm1\n\t"
|
|
764 "punpckhbw %%mm7, %%mm2\n\t"
|
|
765 "punpckhbw %%mm7, %%mm3\n\t"
|
|
766 "paddusw %%mm1, %%mm0\n\t"
|
|
767 "paddusw %%mm3, %%mm2\n\t"
|
|
768 "movq %%mm4, %%mm1\n\t"
|
|
769 "movq %%mm5, %%mm3\n\t"
|
|
770 "punpcklbw %%mm7, %%mm4\n\t"
|
|
771 "punpcklbw %%mm7, %%mm5\n\t"
|
|
772 "punpckhbw %%mm7, %%mm1\n\t"
|
|
773 "punpckhbw %%mm7, %%mm3\n\t"
|
|
774 "paddusw %%mm5, %%mm4\n\t"
|
|
775 "paddusw %%mm3, %%mm1\n\t"
|
|
776 "paddusw %%mm6, %%mm4\n\t"
|
|
777 "paddusw %%mm6, %%mm1\n\t"
|
|
778 "paddusw %%mm4, %%mm0\n\t"
|
|
779 "paddusw %%mm1, %%mm2\n\t"
|
|
780 "movq %0, %%mm1\n\t"
|
|
781 "psrlw $2, %%mm0\n\t"
|
|
782 "movq %%mm1, %%mm3\n\t"
|
|
783 "psrlw $2, %%mm2\n\t"
|
|
784 "punpcklbw %%mm7, %%mm1\n\t"
|
|
785 "punpckhbw %%mm7, %%mm3\n\t"
|
|
786 "paddusw %%mm1, %%mm0\n\t"
|
|
787 "paddusw %%mm3, %%mm2\n\t"
|
|
788 "psrlw $1, %%mm0\n\t"
|
|
789 "psrlw $1, %%mm2\n\t"
|
|
790 "packuswb %%mm2, %%mm0\n\t"
|
|
791 "movq %%mm0, %0\n\t"
|
|
792 :"=m"(*p)
|
|
793 :"m"(*pix),
|
|
794 "m"(*(pix+line_size))
|
|
795 :"memory");
|
|
796 pix += line_size;
|
|
797 p += line_size;
|
|
798 } while(--h);
|
|
799 }
|
|
800
|
|
801 static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
|
|
802 {
|
|
803 DCTELEM *p;
|
|
804 const UINT8 *pix;
|
|
805 p = block;
|
|
806 pix = pixels;
|
|
807 __asm __volatile("pxor %%mm7, %%mm7":::"memory");
|
|
808 do {
|
|
809 __asm __volatile(
|
|
810 "movq %0, %%mm0\n\t"
|
|
811 "movq %1, %%mm2\n\t"
|
|
812 "movq 8%0, %%mm1\n\t"
|
|
813 "movq %%mm2, %%mm3\n\t"
|
|
814 "punpcklbw %%mm7, %%mm2\n\t"
|
|
815 "punpckhbw %%mm7, %%mm3\n\t"
|
|
816 "psubsw %%mm2, %%mm0\n\t"
|
|
817 "psubsw %%mm3, %%mm1\n\t"
|
|
818 "movq %%mm0, %0\n\t"
|
|
819 "movq %%mm1, 8%0\n\t"
|
|
820 :"=m"(*p)
|
|
821 :"m"(*pix)
|
|
822 :"memory");
|
|
823 pix += line_size;
|
|
824 p += 8;
|
|
825 } while (--h);
|
|
826 }
|
|
827
|
|
828 static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
|
|
829 {
|
|
830 DCTELEM *p;
|
|
831 const UINT8 *pix;
|
|
832 p = block;
|
|
833 pix = pixels;
|
|
834 __asm __volatile(
|
|
835 "pxor %%mm7, %%mm7\n\t"
|
|
836 "movq %0, %%mm6"
|
8
|
837 ::"m"(mm_wone):"memory");
|
0
|
838 do {
|
|
839 __asm __volatile(
|
|
840 "movq %0, %%mm0\n\t"
|
|
841 "movq %1, %%mm2\n\t"
|
|
842 "movq 8%0, %%mm1\n\t"
|
|
843 "movq 1%1, %%mm4\n\t"
|
|
844 "movq %%mm2, %%mm3\n\t"
|
|
845 "movq %%mm4, %%mm5\n\t"
|
|
846 "punpcklbw %%mm7, %%mm2\n\t"
|
|
847 "punpckhbw %%mm7, %%mm3\n\t"
|
|
848 "punpcklbw %%mm7, %%mm4\n\t"
|
|
849 "punpckhbw %%mm7, %%mm5\n\t"
|
|
850 "paddusw %%mm4, %%mm2\n\t"
|
|
851 "paddusw %%mm5, %%mm3\n\t"
|
|
852 "paddusw %%mm6, %%mm2\n\t"
|
|
853 "paddusw %%mm6, %%mm3\n\t"
|
|
854 "psrlw $1, %%mm2\n\t"
|
|
855 "psrlw $1, %%mm3\n\t"
|
|
856 "psubsw %%mm2, %%mm0\n\t"
|
|
857 "psubsw %%mm3, %%mm1\n\t"
|
|
858 "movq %%mm0, %0\n\t"
|
|
859 "movq %%mm1, 8%0\n\t"
|
|
860 :"=m"(*p)
|
|
861 :"m"(*pix)
|
|
862 :"memory");
|
|
863 pix += line_size;
|
|
864 p += 8;
|
|
865 } while (--h);
|
|
866 }
|
|
867
|
|
868 static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
|
|
869 {
|
|
870 DCTELEM *p;
|
|
871 const UINT8 *pix;
|
|
872 p = block;
|
|
873 pix = pixels;
|
|
874 __asm __volatile(
|
|
875 "pxor %%mm7, %%mm7\n\t"
|
|
876 "movq %0, %%mm6"
|
8
|
877 ::"m"(mm_wone):"memory");
|
0
|
878 do {
|
|
879 __asm __volatile(
|
|
880 "movq %0, %%mm0\n\t"
|
|
881 "movq %1, %%mm2\n\t"
|
|
882 "movq 8%0, %%mm1\n\t"
|
|
883 "movq %2, %%mm4\n\t"
|
|
884 "movq %%mm2, %%mm3\n\t"
|
|
885 "movq %%mm4, %%mm5\n\t"
|
|
886 "punpcklbw %%mm7, %%mm2\n\t"
|
|
887 "punpckhbw %%mm7, %%mm3\n\t"
|
|
888 "punpcklbw %%mm7, %%mm4\n\t"
|
|
889 "punpckhbw %%mm7, %%mm5\n\t"
|
|
890 "paddusw %%mm4, %%mm2\n\t"
|
|
891 "paddusw %%mm5, %%mm3\n\t"
|
|
892 "paddusw %%mm6, %%mm2\n\t"
|
|
893 "paddusw %%mm6, %%mm3\n\t"
|
|
894 "psrlw $1, %%mm2\n\t"
|
|
895 "psrlw $1, %%mm3\n\t"
|
|
896 "psubsw %%mm2, %%mm0\n\t"
|
|
897 "psubsw %%mm3, %%mm1\n\t"
|
|
898 "movq %%mm0, %0\n\t"
|
|
899 "movq %%mm1, 8%0\n\t"
|
|
900 :"=m"(*p)
|
|
901 :"m"(*pix), "m"(*(pix+line_size))
|
|
902 :"memory");
|
|
903 pix += line_size;
|
|
904 p += 8;
|
|
905 } while (--h);
|
|
906 }
|
|
907
|
|
908 static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
|
|
909 {
|
|
910 DCTELEM *p;
|
|
911 const UINT8 *pix;
|
|
912 p = block;
|
|
913 pix = pixels;
|
|
914 __asm __volatile(
|
|
915 "pxor %%mm7, %%mm7\n\t"
|
|
916 "movq %0, %%mm6\n\t"
|
8
|
917 ::"m"(mm_wtwo):"memory");
|
0
|
918 do {
|
|
919 __asm __volatile(
|
|
920 "movq %1, %%mm0\n\t"
|
|
921 "movq %2, %%mm1\n\t"
|
|
922 "movq 1%1, %%mm4\n\t"
|
|
923 "movq 1%2, %%mm5\n\t"
|
|
924 "movq %%mm0, %%mm2\n\t"
|
|
925 "movq %%mm1, %%mm3\n\t"
|
|
926 "punpcklbw %%mm7, %%mm0\n\t"
|
|
927 "punpcklbw %%mm7, %%mm1\n\t"
|
|
928 "punpckhbw %%mm7, %%mm2\n\t"
|
|
929 "punpckhbw %%mm7, %%mm3\n\t"
|
|
930 "paddusw %%mm1, %%mm0\n\t"
|
|
931 "paddusw %%mm3, %%mm2\n\t"
|
|
932 "movq %%mm4, %%mm1\n\t"
|
|
933 "movq %%mm5, %%mm3\n\t"
|
|
934 "punpcklbw %%mm7, %%mm4\n\t"
|
|
935 "punpcklbw %%mm7, %%mm5\n\t"
|
|
936 "punpckhbw %%mm7, %%mm1\n\t"
|
|
937 "punpckhbw %%mm7, %%mm3\n\t"
|
|
938 "paddusw %%mm5, %%mm4\n\t"
|
|
939 "paddusw %%mm3, %%mm1\n\t"
|
|
940 "paddusw %%mm6, %%mm4\n\t"
|
|
941 "paddusw %%mm6, %%mm1\n\t"
|
|
942 "paddusw %%mm4, %%mm0\n\t"
|
|
943 "paddusw %%mm1, %%mm2\n\t"
|
|
944 "movq %0, %%mm1\n\t"
|
|
945 "movq 8%0, %%mm3\n\t"
|
|
946 "psrlw $2, %%mm0\n\t"
|
|
947 "psrlw $2, %%mm2\n\t"
|
|
948 "psubsw %%mm0, %%mm1\n\t"
|
|
949 "psubsw %%mm2, %%mm3\n\t"
|
|
950 "movq %%mm1, %0\n\t"
|
|
951 "movq %%mm3, 8%0\n\t"
|
|
952 :"=m"(*p)
|
|
953 :"m"(*pix),
|
|
954 "m"(*(pix+line_size))
|
|
955 :"memory");
|
|
956 pix += line_size;
|
|
957 p += 8 ;
|
|
958 } while(--h);
|
|
959 }
|
|
960
|
|
961 void dsputil_init_mmx(void)
|
|
962 {
|
|
963 mm_flags = mm_support();
|
|
964 #if 0
|
|
965 printf("CPU flags:");
|
|
966 if (mm_flags & MM_MMX)
|
|
967 printf(" mmx");
|
|
968 if (mm_flags & MM_MMXEXT)
|
|
969 printf(" mmxext");
|
|
970 if (mm_flags & MM_3DNOW)
|
|
971 printf(" 3dnow");
|
|
972 if (mm_flags & MM_SSE)
|
|
973 printf(" sse");
|
|
974 if (mm_flags & MM_SSE2)
|
|
975 printf(" sse2");
|
|
976 printf("\n");
|
|
977 #endif
|
|
978
|
|
979 if (mm_flags & MM_MMX) {
|
|
980 get_pixels = get_pixels_mmx;
|
|
981 put_pixels_clamped = put_pixels_clamped_mmx;
|
|
982 add_pixels_clamped = add_pixels_clamped_mmx;
|
|
983
|
|
984 pix_abs16x16 = pix_abs16x16_mmx;
|
|
985 pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
|
|
986 pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
|
|
987 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
|
|
988 av_fdct = fdct_mmx;
|
|
989
|
|
990 put_pixels_tab[0] = put_pixels_mmx;
|
|
991 put_pixels_tab[1] = put_pixels_x2_mmx;
|
|
992 put_pixels_tab[2] = put_pixels_y2_mmx;
|
|
993 put_pixels_tab[3] = put_pixels_xy2_mmx;
|
|
994
|
|
995 put_no_rnd_pixels_tab[0] = put_pixels_mmx;
|
|
996 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
|
|
997 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
|
|
998 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
|
|
999
|
|
1000 avg_pixels_tab[0] = avg_pixels_mmx;
|
|
1001 avg_pixels_tab[1] = avg_pixels_x2_mmx;
|
|
1002 avg_pixels_tab[2] = avg_pixels_y2_mmx;
|
|
1003 avg_pixels_tab[3] = avg_pixels_xy2_mmx;
|
|
1004
|
|
1005 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
|
|
1006 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
|
|
1007 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
|
|
1008 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
|
|
1009
|
|
1010 sub_pixels_tab[0] = sub_pixels_mmx;
|
|
1011 sub_pixels_tab[1] = sub_pixels_x2_mmx;
|
|
1012 sub_pixels_tab[2] = sub_pixels_y2_mmx;
|
|
1013 sub_pixels_tab[3] = sub_pixels_xy2_mmx;
|
|
1014
|
|
1015 if (mm_flags & MM_MMXEXT) {
|
|
1016 pix_abs16x16 = pix_abs16x16_sse;
|
|
1017 }
|
|
1018
|
|
1019 if (mm_flags & MM_SSE) {
|
|
1020 put_pixels_tab[1] = put_pixels_x2_sse;
|
|
1021 put_pixels_tab[2] = put_pixels_y2_sse;
|
|
1022
|
|
1023 avg_pixels_tab[0] = avg_pixels_sse;
|
|
1024 avg_pixels_tab[1] = avg_pixels_x2_sse;
|
|
1025 avg_pixels_tab[2] = avg_pixels_y2_sse;
|
|
1026 avg_pixels_tab[3] = avg_pixels_xy2_sse;
|
|
1027
|
|
1028 sub_pixels_tab[1] = sub_pixels_x2_sse;
|
|
1029 sub_pixels_tab[2] = sub_pixels_y2_sse;
|
|
1030 } else if (mm_flags & MM_3DNOW) {
|
|
1031 put_pixels_tab[1] = put_pixels_x2_3dnow;
|
|
1032 put_pixels_tab[2] = put_pixels_y2_3dnow;
|
|
1033
|
|
1034 avg_pixels_tab[0] = avg_pixels_3dnow;
|
|
1035 avg_pixels_tab[1] = avg_pixels_x2_3dnow;
|
|
1036 avg_pixels_tab[2] = avg_pixels_y2_3dnow;
|
|
1037 avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
|
|
1038
|
|
1039 sub_pixels_tab[1] = sub_pixels_x2_3dnow;
|
|
1040 sub_pixels_tab[2] = sub_pixels_y2_3dnow;
|
|
1041 }
|
|
1042 }
|
|
1043 }
|