Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 39:f6806d3e2d37 libavcodec
updated scans
author | glantau |
---|---|
date | Tue, 07 Aug 2001 22:47:28 +0000 |
parents | 82d4c9be9873 |
children | 8068c4bce9c1 |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
3 * Copyright (c) 2000, 2001 Gerard Lantau. | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 * | |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 #include "../dsputil.h" | |
23 | |
5 | 24 int mm_flags; /* multimedia extension flags */ |
25 | |
0 | 26 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); |
27 int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
28 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
29 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
30 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
31 | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
32 #ifdef USE_MMX_IDCT |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
33 /* external functions, defined in libmpeg2 */ |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
34 void mmx_idct(DCTELEM *block); |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
35 void mmxext_idct(DCTELEM *block); |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
36 /* this should be in dsputil.h? -- A'rpi */ |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
37 extern UINT8 ff_alternate_horizontal_scan[64]; |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
38 extern UINT8 ff_alternate_vertical_scan[64]; |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
39 extern UINT8 zigzag_direct[64]; |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
40 #endif |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
41 |
/* pixel operations */

/*
 * Rounding constants, each holding four packed 16-bit words.
 * They are loaded into an MMX register with movq, hence the 8-byte
 * alignment.  The ULL suffix makes the 64-bit width of the literals
 * explicit, so compilers whose 'long' is 32 bits do not warn about them.
 */
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; /* 4 x 1 */
static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; /* 4 x 2 */
0 | 47 |
48 /***********************************/ | |
49 /* 3Dnow specific */ | |
50 | |
51 #define DEF(x) x ## _3dnow | |
52 /* for Athlons PAVGUSB is preferred */ | |
53 #define PAVGB "pavgusb" | |
54 | |
55 #include "dsputil_mmx_avg.h" | |
56 | |
57 #undef DEF | |
58 #undef PAVGB | |
59 | |
60 /***********************************/ | |
61 /* MMX2 specific */ | |
62 | |
63 #define DEF(x) x ## _sse | |
64 | |
65 /* Introduced only in MMX2 set */ | |
66 #define PAVGB "pavgb" | |
67 | |
68 #include "dsputil_mmx_avg.h" | |
69 | |
70 #undef DEF | |
71 #undef PAVGB | |
72 | |
73 /***********************************/ | |
74 /* standard MMX */ | |
75 | |
76 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
77 { | |
78 DCTELEM *p; | |
79 const UINT8 *pix; | |
80 int i; | |
81 | |
82 /* read the pixels */ | |
83 p = block; | |
84 pix = pixels; | |
85 __asm __volatile("pxor %%mm7, %%mm7":::"memory"); | |
86 for(i=0;i<4;i++) { | |
87 __asm __volatile( | |
88 "movq %1, %%mm0\n\t" | |
89 "movq %2, %%mm1\n\t" | |
90 "movq %%mm0, %%mm2\n\t" | |
91 "movq %%mm1, %%mm3\n\t" | |
92 "punpcklbw %%mm7, %%mm0\n\t" | |
93 "punpckhbw %%mm7, %%mm2\n\t" | |
94 "punpcklbw %%mm7, %%mm1\n\t" | |
95 "punpckhbw %%mm7, %%mm3\n\t" | |
96 "movq %%mm0, %0\n\t" | |
97 "movq %%mm2, 8%0\n\t" | |
98 "movq %%mm1, 16%0\n\t" | |
99 "movq %%mm3, 24%0\n\t" | |
100 :"=m"(*p) | |
101 :"m"(*pix), "m"(*(pix+line_size)) | |
102 :"memory"); | |
103 pix += line_size*2; | |
104 p += 16; | |
105 } | |
106 } | |
107 | |
108 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
109 { | |
110 const DCTELEM *p; | |
111 UINT8 *pix; | |
112 int i; | |
113 | |
114 /* read the pixels */ | |
115 p = block; | |
116 pix = pixels; | |
117 for(i=0;i<2;i++) { | |
118 __asm __volatile( | |
119 "movq %4, %%mm0\n\t" | |
120 "movq 8%4, %%mm1\n\t" | |
121 "movq 16%4, %%mm2\n\t" | |
122 "movq 24%4, %%mm3\n\t" | |
123 "movq 32%4, %%mm4\n\t" | |
124 "movq 40%4, %%mm5\n\t" | |
125 "movq 48%4, %%mm6\n\t" | |
126 "movq 56%4, %%mm7\n\t" | |
127 "packuswb %%mm1, %%mm0\n\t" | |
128 "packuswb %%mm3, %%mm2\n\t" | |
129 "packuswb %%mm5, %%mm4\n\t" | |
130 "packuswb %%mm7, %%mm6\n\t" | |
131 "movq %%mm0, %0\n\t" | |
132 "movq %%mm2, %1\n\t" | |
133 "movq %%mm4, %2\n\t" | |
134 "movq %%mm6, %3\n\t" | |
135 :"=m"(*pix), "=m"(*(pix+line_size)) | |
136 ,"=m"(*(pix+line_size*2)), "=m"(*(pix+line_size*3)) | |
137 :"m"(*p) | |
138 :"memory"); | |
139 pix += line_size*4; | |
140 p += 32; | |
141 } | |
142 } | |
143 | |
144 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
145 { | |
146 const DCTELEM *p; | |
147 UINT8 *pix; | |
148 int i; | |
149 | |
150 /* read the pixels */ | |
151 p = block; | |
152 pix = pixels; | |
153 __asm __volatile("pxor %%mm7, %%mm7":::"memory"); | |
154 for(i=0;i<4;i++) { | |
155 __asm __volatile( | |
156 "movq %2, %%mm0\n\t" | |
157 "movq 8%2, %%mm1\n\t" | |
158 "movq 16%2, %%mm2\n\t" | |
159 "movq 24%2, %%mm3\n\t" | |
160 "movq %0, %%mm4\n\t" | |
161 "movq %1, %%mm6\n\t" | |
162 "movq %%mm4, %%mm5\n\t" | |
163 "punpcklbw %%mm7, %%mm4\n\t" | |
164 "punpckhbw %%mm7, %%mm5\n\t" | |
165 "paddsw %%mm4, %%mm0\n\t" | |
166 "paddsw %%mm5, %%mm1\n\t" | |
167 "movq %%mm6, %%mm5\n\t" | |
168 "punpcklbw %%mm7, %%mm6\n\t" | |
169 "punpckhbw %%mm7, %%mm5\n\t" | |
170 "paddsw %%mm6, %%mm2\n\t" | |
171 "paddsw %%mm5, %%mm3\n\t" | |
172 "packuswb %%mm1, %%mm0\n\t" | |
173 "packuswb %%mm3, %%mm2\n\t" | |
174 "movq %%mm0, %0\n\t" | |
175 "movq %%mm2, %1\n\t" | |
176 :"=m"(*pix), "=m"(*(pix+line_size)) | |
177 :"m"(*p) | |
178 :"memory"); | |
179 pix += line_size*2; | |
180 p += 16; | |
181 } | |
182 } | |
183 | |
184 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
185 { | |
186 int dh, hh; | |
187 UINT8 *p; | |
188 const UINT8 *pix; | |
189 p = block; | |
190 pix = pixels; | |
191 hh=h>>2; | |
192 dh=h&3; | |
193 while(hh--) { | |
194 __asm __volatile( | |
195 "movq %4, %%mm0\n\t" | |
196 "movq %5, %%mm1\n\t" | |
197 "movq %6, %%mm2\n\t" | |
198 "movq %7, %%mm3\n\t" | |
199 "movq %%mm0, %0\n\t" | |
200 "movq %%mm1, %1\n\t" | |
201 "movq %%mm2, %2\n\t" | |
202 "movq %%mm3, %3\n\t" | |
203 :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3)) | |
204 :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3)) | |
205 :"memory"); | |
206 pix = pix + line_size*4; | |
207 p = p + line_size*4; | |
208 } | |
209 while(dh--) { | |
210 __asm __volatile( | |
211 "movq %1, %%mm0\n\t" | |
212 "movq %%mm0, %0\n\t" | |
213 :"=m"(*p) | |
214 :"m"(*pix) | |
215 :"memory"); | |
216 pix = pix + line_size; | |
217 p = p + line_size; | |
218 } | |
219 } | |
220 | |
221 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
222 { | |
223 UINT8 *p; | |
224 const UINT8 *pix; | |
225 p = block; | |
226 pix = pixels; | |
227 __asm __volatile( | |
228 "pxor %%mm7, %%mm7\n\t" | |
229 "movq %0, %%mm4\n\t" | |
8 | 230 ::"m"(mm_wone):"memory"); |
0 | 231 do { |
232 __asm __volatile( | |
233 "movq %1, %%mm0\n\t" | |
234 "movq 1%1, %%mm1\n\t" | |
235 "movq %%mm0, %%mm2\n\t" | |
236 "movq %%mm1, %%mm3\n\t" | |
237 "punpcklbw %%mm7, %%mm0\n\t" | |
238 "punpcklbw %%mm7, %%mm1\n\t" | |
239 "punpckhbw %%mm7, %%mm2\n\t" | |
240 "punpckhbw %%mm7, %%mm3\n\t" | |
241 "paddusw %%mm1, %%mm0\n\t" | |
242 "paddusw %%mm3, %%mm2\n\t" | |
243 "paddusw %%mm4, %%mm0\n\t" | |
244 "paddusw %%mm4, %%mm2\n\t" | |
245 "psrlw $1, %%mm0\n\t" | |
246 "psrlw $1, %%mm2\n\t" | |
247 "packuswb %%mm2, %%mm0\n\t" | |
248 "movq %%mm0, %0\n\t" | |
249 :"=m"(*p) | |
250 :"m"(*pix) | |
251 :"memory"); | |
252 pix += line_size; p += line_size; | |
253 } while (--h); | |
254 } | |
255 | |
256 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
257 { | |
258 UINT8 *p; | |
259 const UINT8 *pix; | |
260 p = block; | |
261 pix = pixels; | |
262 __asm __volatile( | |
263 "pxor %%mm7, %%mm7\n\t" | |
264 "movq %0, %%mm4\n\t" | |
8 | 265 ::"m"(mm_wone):"memory"); |
0 | 266 do { |
267 __asm __volatile( | |
268 "movq %1, %%mm0\n\t" | |
269 "movq %2, %%mm1\n\t" | |
270 "movq %%mm0, %%mm2\n\t" | |
271 "movq %%mm1, %%mm3\n\t" | |
272 "punpcklbw %%mm7, %%mm0\n\t" | |
273 "punpcklbw %%mm7, %%mm1\n\t" | |
274 "punpckhbw %%mm7, %%mm2\n\t" | |
275 "punpckhbw %%mm7, %%mm3\n\t" | |
276 "paddusw %%mm1, %%mm0\n\t" | |
277 "paddusw %%mm3, %%mm2\n\t" | |
278 "paddusw %%mm4, %%mm0\n\t" | |
279 "paddusw %%mm4, %%mm2\n\t" | |
280 "psrlw $1, %%mm0\n\t" | |
281 "psrlw $1, %%mm2\n\t" | |
282 "packuswb %%mm2, %%mm0\n\t" | |
283 "movq %%mm0, %0\n\t" | |
284 :"=m"(*p) | |
285 :"m"(*pix), | |
286 "m"(*(pix+line_size)) | |
287 :"memory"); | |
288 pix += line_size; | |
289 p += line_size; | |
290 } while (--h); | |
291 } | |
292 | |
293 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
294 { | |
295 UINT8 *p; | |
296 const UINT8 *pix; | |
297 p = block; | |
298 pix = pixels; | |
299 __asm __volatile( | |
300 "pxor %%mm7, %%mm7\n\t" | |
301 "movq %0, %%mm6\n\t" | |
8 | 302 ::"m"(mm_wtwo):"memory"); |
0 | 303 do { |
304 __asm __volatile( | |
305 "movq %1, %%mm0\n\t" | |
306 "movq %2, %%mm1\n\t" | |
307 "movq 1%1, %%mm4\n\t" | |
308 "movq 1%2, %%mm5\n\t" | |
309 "movq %%mm0, %%mm2\n\t" | |
310 "movq %%mm1, %%mm3\n\t" | |
311 "punpcklbw %%mm7, %%mm0\n\t" | |
312 "punpcklbw %%mm7, %%mm1\n\t" | |
313 "punpckhbw %%mm7, %%mm2\n\t" | |
314 "punpckhbw %%mm7, %%mm3\n\t" | |
315 "paddusw %%mm1, %%mm0\n\t" | |
316 "paddusw %%mm3, %%mm2\n\t" | |
317 "movq %%mm4, %%mm1\n\t" | |
318 "movq %%mm5, %%mm3\n\t" | |
319 "punpcklbw %%mm7, %%mm4\n\t" | |
320 "punpcklbw %%mm7, %%mm5\n\t" | |
321 "punpckhbw %%mm7, %%mm1\n\t" | |
322 "punpckhbw %%mm7, %%mm3\n\t" | |
323 "paddusw %%mm5, %%mm4\n\t" | |
324 "paddusw %%mm3, %%mm1\n\t" | |
325 "paddusw %%mm6, %%mm4\n\t" | |
326 "paddusw %%mm6, %%mm1\n\t" | |
327 "paddusw %%mm4, %%mm0\n\t" | |
328 "paddusw %%mm1, %%mm2\n\t" | |
329 "psrlw $2, %%mm0\n\t" | |
330 "psrlw $2, %%mm2\n\t" | |
331 "packuswb %%mm2, %%mm0\n\t" | |
332 "movq %%mm0, %0\n\t" | |
333 :"=m"(*p) | |
334 :"m"(*pix), | |
335 "m"(*(pix+line_size)) | |
336 :"memory"); | |
337 pix += line_size; | |
338 p += line_size; | |
339 } while(--h); | |
340 } | |
341 | |
342 static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
343 { | |
344 UINT8 *p; | |
345 const UINT8 *pix; | |
346 p = block; | |
347 pix = pixels; | |
348 __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory"); | |
349 do { | |
350 __asm __volatile( | |
351 "movq %1, %%mm0\n\t" | |
352 "movq 1%1, %%mm1\n\t" | |
353 "movq %%mm0, %%mm2\n\t" | |
354 "movq %%mm1, %%mm3\n\t" | |
355 "punpcklbw %%mm7, %%mm0\n\t" | |
356 "punpcklbw %%mm7, %%mm1\n\t" | |
357 "punpckhbw %%mm7, %%mm2\n\t" | |
358 "punpckhbw %%mm7, %%mm3\n\t" | |
359 "paddusw %%mm1, %%mm0\n\t" | |
360 "paddusw %%mm3, %%mm2\n\t" | |
361 "psrlw $1, %%mm0\n\t" | |
362 "psrlw $1, %%mm2\n\t" | |
363 "packuswb %%mm2, %%mm0\n\t" | |
364 "movq %%mm0, %0\n\t" | |
365 :"=m"(*p) | |
366 :"m"(*pix) | |
367 :"memory"); | |
368 pix += line_size; | |
369 p += line_size; | |
370 } while (--h); | |
371 } | |
372 | |
373 static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
374 { | |
375 UINT8 *p; | |
376 const UINT8 *pix; | |
377 p = block; | |
378 pix = pixels; | |
379 __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory"); | |
380 do { | |
381 __asm __volatile( | |
382 "movq %1, %%mm0\n\t" | |
383 "movq %2, %%mm1\n\t" | |
384 "movq %%mm0, %%mm2\n\t" | |
385 "movq %%mm1, %%mm3\n\t" | |
386 "punpcklbw %%mm7, %%mm0\n\t" | |
387 "punpcklbw %%mm7, %%mm1\n\t" | |
388 "punpckhbw %%mm7, %%mm2\n\t" | |
389 "punpckhbw %%mm7, %%mm3\n\t" | |
390 "paddusw %%mm1, %%mm0\n\t" | |
391 "paddusw %%mm3, %%mm2\n\t" | |
392 "psrlw $1, %%mm0\n\t" | |
393 "psrlw $1, %%mm2\n\t" | |
394 "packuswb %%mm2, %%mm0\n\t" | |
395 "movq %%mm0, %0\n\t" | |
396 :"=m"(*p) | |
397 :"m"(*pix), | |
398 "m"(*(pix+line_size)) | |
399 :"memory"); | |
400 pix += line_size; | |
401 p += line_size; | |
402 } while(--h); | |
403 } | |
404 | |
405 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
406 { | |
407 UINT8 *p; | |
408 const UINT8 *pix; | |
409 p = block; | |
410 pix = pixels; | |
411 __asm __volatile( | |
412 "pxor %%mm7, %%mm7\n\t" | |
413 "movq %0, %%mm6\n\t" | |
8 | 414 ::"m"(mm_wone):"memory"); |
0 | 415 do { |
416 __asm __volatile( | |
417 "movq %1, %%mm0\n\t" | |
418 "movq %2, %%mm1\n\t" | |
419 "movq 1%1, %%mm4\n\t" | |
420 "movq 1%2, %%mm5\n\t" | |
421 "movq %%mm0, %%mm2\n\t" | |
422 "movq %%mm1, %%mm3\n\t" | |
423 "punpcklbw %%mm7, %%mm0\n\t" | |
424 "punpcklbw %%mm7, %%mm1\n\t" | |
425 "punpckhbw %%mm7, %%mm2\n\t" | |
426 "punpckhbw %%mm7, %%mm3\n\t" | |
427 "paddusw %%mm1, %%mm0\n\t" | |
428 "paddusw %%mm3, %%mm2\n\t" | |
429 "movq %%mm4, %%mm1\n\t" | |
430 "movq %%mm5, %%mm3\n\t" | |
431 "punpcklbw %%mm7, %%mm4\n\t" | |
432 "punpcklbw %%mm7, %%mm5\n\t" | |
433 "punpckhbw %%mm7, %%mm1\n\t" | |
434 "punpckhbw %%mm7, %%mm3\n\t" | |
435 "paddusw %%mm5, %%mm4\n\t" | |
436 "paddusw %%mm3, %%mm1\n\t" | |
437 "paddusw %%mm6, %%mm4\n\t" | |
438 "paddusw %%mm6, %%mm1\n\t" | |
439 "paddusw %%mm4, %%mm0\n\t" | |
440 "paddusw %%mm1, %%mm2\n\t" | |
441 "psrlw $2, %%mm0\n\t" | |
442 "psrlw $2, %%mm2\n\t" | |
443 "packuswb %%mm2, %%mm0\n\t" | |
444 "movq %%mm0, %0\n\t" | |
445 :"=m"(*p) | |
446 :"m"(*pix), | |
447 "m"(*(pix+line_size)) | |
448 :"memory"); | |
449 pix += line_size; | |
450 p += line_size; | |
451 } while(--h); | |
452 } | |
453 | |
454 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
455 { | |
456 UINT8 *p; | |
457 const UINT8 *pix; | |
458 p = block; | |
459 pix = pixels; | |
460 __asm __volatile( | |
461 "pxor %%mm7, %%mm7\n\t" | |
462 "movq %0, %%mm6\n\t" | |
8 | 463 ::"m"(mm_wone):"memory"); |
0 | 464 do { |
465 __asm __volatile( | |
466 "movq %0, %%mm0\n\t" | |
467 "movq %1, %%mm1\n\t" | |
468 "movq %%mm0, %%mm2\n\t" | |
469 "movq %%mm1, %%mm3\n\t" | |
470 "punpcklbw %%mm7, %%mm0\n\t" | |
471 "punpcklbw %%mm7, %%mm1\n\t" | |
472 "punpckhbw %%mm7, %%mm2\n\t" | |
473 "punpckhbw %%mm7, %%mm3\n\t" | |
474 "paddusw %%mm1, %%mm0\n\t" | |
475 "paddusw %%mm3, %%mm2\n\t" | |
476 "paddusw %%mm6, %%mm0\n\t" | |
477 "paddusw %%mm6, %%mm2\n\t" | |
478 "psrlw $1, %%mm0\n\t" | |
479 "psrlw $1, %%mm2\n\t" | |
480 "packuswb %%mm2, %%mm0\n\t" | |
481 "movq %%mm0, %0\n\t" | |
482 :"=m"(*p) | |
483 :"m"(*pix) | |
484 :"memory"); | |
485 pix += line_size; | |
486 p += line_size; | |
487 } | |
488 while (--h); | |
489 } | |
490 | |
491 static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
492 { | |
493 UINT8 *p; | |
494 const UINT8 *pix; | |
495 p = block; | |
496 pix = pixels; | |
497 __asm __volatile( | |
498 "pxor %%mm7, %%mm7\n\t" | |
499 "movq %0, %%mm6\n\t" | |
8 | 500 ::"m"(mm_wone):"memory"); |
0 | 501 do { |
502 __asm __volatile( | |
503 "movq %1, %%mm1\n\t" | |
504 "movq %0, %%mm0\n\t" | |
505 "movq 1%1, %%mm4\n\t" | |
506 "movq %%mm0, %%mm2\n\t" | |
507 "movq %%mm1, %%mm3\n\t" | |
508 "movq %%mm4, %%mm5\n\t" | |
509 "punpcklbw %%mm7, %%mm1\n\t" | |
510 "punpckhbw %%mm7, %%mm3\n\t" | |
511 "punpcklbw %%mm7, %%mm4\n\t" | |
512 "punpckhbw %%mm7, %%mm5\n\t" | |
513 "punpcklbw %%mm7, %%mm0\n\t" | |
514 "punpckhbw %%mm7, %%mm2\n\t" | |
515 "paddusw %%mm4, %%mm1\n\t" | |
516 "paddusw %%mm5, %%mm3\n\t" | |
517 "paddusw %%mm6, %%mm1\n\t" | |
518 "paddusw %%mm6, %%mm3\n\t" | |
519 "psrlw $1, %%mm1\n\t" | |
520 "psrlw $1, %%mm3\n\t" | |
521 "paddusw %%mm6, %%mm0\n\t" | |
522 "paddusw %%mm6, %%mm2\n\t" | |
523 "paddusw %%mm1, %%mm0\n\t" | |
524 "paddusw %%mm3, %%mm2\n\t" | |
525 "psrlw $1, %%mm0\n\t" | |
526 "psrlw $1, %%mm2\n\t" | |
527 "packuswb %%mm2, %%mm0\n\t" | |
528 "movq %%mm0, %0\n\t" | |
529 :"=m"(*p) | |
530 :"m"(*pix) | |
531 :"memory"); | |
532 pix += line_size; | |
533 p += line_size; | |
534 } while (--h); | |
535 } | |
536 | |
537 static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
538 { | |
539 UINT8 *p; | |
540 const UINT8 *pix; | |
541 p = block; | |
542 pix = pixels; | |
543 __asm __volatile( | |
544 "pxor %%mm7, %%mm7\n\t" | |
545 "movq %0, %%mm6\n\t" | |
8 | 546 ::"m"(mm_wone):"memory"); |
0 | 547 do { |
548 __asm __volatile( | |
549 "movq %1, %%mm1\n\t" | |
550 "movq %0, %%mm0\n\t" | |
551 "movq %2, %%mm4\n\t" | |
552 "movq %%mm0, %%mm2\n\t" | |
553 "movq %%mm1, %%mm3\n\t" | |
554 "movq %%mm4, %%mm5\n\t" | |
555 "punpcklbw %%mm7, %%mm1\n\t" | |
556 "punpckhbw %%mm7, %%mm3\n\t" | |
557 "punpcklbw %%mm7, %%mm4\n\t" | |
558 "punpckhbw %%mm7, %%mm5\n\t" | |
559 "punpcklbw %%mm7, %%mm0\n\t" | |
560 "punpckhbw %%mm7, %%mm2\n\t" | |
561 "paddusw %%mm4, %%mm1\n\t" | |
562 "paddusw %%mm5, %%mm3\n\t" | |
563 "paddusw %%mm6, %%mm1\n\t" | |
564 "paddusw %%mm6, %%mm3\n\t" | |
565 "psrlw $1, %%mm1\n\t" | |
566 "psrlw $1, %%mm3\n\t" | |
567 "paddusw %%mm6, %%mm0\n\t" | |
568 "paddusw %%mm6, %%mm2\n\t" | |
569 "paddusw %%mm1, %%mm0\n\t" | |
570 "paddusw %%mm3, %%mm2\n\t" | |
571 "psrlw $1, %%mm0\n\t" | |
572 "psrlw $1, %%mm2\n\t" | |
573 "packuswb %%mm2, %%mm0\n\t" | |
574 "movq %%mm0, %0\n\t" | |
575 :"=m"(*p) | |
576 :"m"(*pix), "m"(*(pix+line_size)) | |
577 :"memory"); | |
578 pix += line_size; | |
579 p += line_size ; | |
580 } while(--h); | |
581 } | |
582 | |
583 static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
584 { | |
585 UINT8 *p; | |
586 const UINT8 *pix; | |
587 p = block; | |
588 pix = pixels; | |
589 __asm __volatile( | |
590 "pxor %%mm7, %%mm7\n\t" | |
591 "movq %0, %%mm6\n\t" | |
8 | 592 ::"m"(mm_wtwo):"memory"); |
0 | 593 do { |
594 __asm __volatile( | |
595 "movq %1, %%mm0\n\t" | |
596 "movq %2, %%mm1\n\t" | |
597 "movq 1%1, %%mm4\n\t" | |
598 "movq 1%2, %%mm5\n\t" | |
599 "movq %%mm0, %%mm2\n\t" | |
600 "movq %%mm1, %%mm3\n\t" | |
601 "punpcklbw %%mm7, %%mm0\n\t" | |
602 "punpcklbw %%mm7, %%mm1\n\t" | |
603 "punpckhbw %%mm7, %%mm2\n\t" | |
604 "punpckhbw %%mm7, %%mm3\n\t" | |
605 "paddusw %%mm1, %%mm0\n\t" | |
606 "paddusw %%mm3, %%mm2\n\t" | |
607 "movq %%mm4, %%mm1\n\t" | |
608 "movq %%mm5, %%mm3\n\t" | |
609 "punpcklbw %%mm7, %%mm4\n\t" | |
610 "punpcklbw %%mm7, %%mm5\n\t" | |
611 "punpckhbw %%mm7, %%mm1\n\t" | |
612 "punpckhbw %%mm7, %%mm3\n\t" | |
613 "paddusw %%mm5, %%mm4\n\t" | |
614 "paddusw %%mm3, %%mm1\n\t" | |
615 "paddusw %%mm6, %%mm4\n\t" | |
616 "paddusw %%mm6, %%mm1\n\t" | |
617 "paddusw %%mm4, %%mm0\n\t" | |
618 "paddusw %%mm1, %%mm2\n\t" | |
619 "movq %3, %%mm5\n\t" | |
620 "psrlw $2, %%mm0\n\t" | |
621 "movq %0, %%mm1\n\t" | |
622 "psrlw $2, %%mm2\n\t" | |
623 "movq %%mm1, %%mm3\n\t" | |
624 "punpcklbw %%mm7, %%mm1\n\t" | |
625 "punpckhbw %%mm7, %%mm3\n\t" | |
626 "paddusw %%mm1, %%mm0\n\t" | |
627 "paddusw %%mm3, %%mm2\n\t" | |
628 "paddusw %%mm5, %%mm0\n\t" | |
629 "paddusw %%mm5, %%mm2\n\t" | |
630 "psrlw $1, %%mm0\n\t" | |
631 "psrlw $1, %%mm2\n\t" | |
632 "packuswb %%mm2, %%mm0\n\t" | |
633 "movq %%mm0, %0\n\t" | |
634 :"=m"(*p) | |
635 :"m"(*pix), | |
8 | 636 "m"(*(pix+line_size)), "m"(mm_wone) |
0 | 637 :"memory"); |
638 pix += line_size; | |
639 p += line_size ; | |
640 } while(--h); | |
641 } | |
642 | |
643 static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
644 { | |
645 UINT8 *p; | |
646 const UINT8 *pix; | |
647 p = block; | |
648 pix = pixels; | |
649 __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory"); | |
650 do { | |
651 __asm __volatile( | |
652 "movq %1, %%mm0\n\t" | |
653 "movq %0, %%mm1\n\t" | |
654 "movq %%mm0, %%mm2\n\t" | |
655 "movq %%mm1, %%mm3\n\t" | |
656 "punpcklbw %%mm7, %%mm0\n\t" | |
657 "punpcklbw %%mm7, %%mm1\n\t" | |
658 "punpckhbw %%mm7, %%mm2\n\t" | |
659 "punpckhbw %%mm7, %%mm3\n\t" | |
660 "paddusw %%mm1, %%mm0\n\t" | |
661 "paddusw %%mm3, %%mm2\n\t" | |
662 "psrlw $1, %%mm0\n\t" | |
663 "psrlw $1, %%mm2\n\t" | |
664 "packuswb %%mm2, %%mm0\n\t" | |
665 "movq %%mm0, %0\n\t" | |
666 :"=m"(*p) | |
667 :"m"(*pix) | |
668 :"memory"); | |
669 pix += line_size; | |
670 p += line_size ; | |
671 } while (--h); | |
672 } | |
673 | |
674 static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
675 { | |
676 UINT8 *p; | |
677 const UINT8 *pix; | |
678 p = block; | |
679 pix = pixels; | |
680 __asm __volatile( | |
681 "pxor %%mm7, %%mm7\n\t":::"memory"); | |
682 do { | |
683 __asm __volatile( | |
684 "movq %1, %%mm0\n\t" | |
685 "movq 1%1, %%mm1\n\t" | |
686 "movq %0, %%mm4\n\t" | |
687 "movq %%mm0, %%mm2\n\t" | |
688 "movq %%mm1, %%mm3\n\t" | |
689 "movq %%mm4, %%mm5\n\t" | |
690 "punpcklbw %%mm7, %%mm0\n\t" | |
691 "punpcklbw %%mm7, %%mm1\n\t" | |
692 "punpckhbw %%mm7, %%mm2\n\t" | |
693 "punpckhbw %%mm7, %%mm3\n\t" | |
694 "punpcklbw %%mm7, %%mm4\n\t" | |
695 "punpckhbw %%mm7, %%mm5\n\t" | |
696 "paddusw %%mm1, %%mm0\n\t" | |
697 "paddusw %%mm3, %%mm2\n\t" | |
698 "psrlw $1, %%mm0\n\t" | |
699 "psrlw $1, %%mm2\n\t" | |
700 "paddusw %%mm4, %%mm0\n\t" | |
701 "paddusw %%mm5, %%mm2\n\t" | |
702 "psrlw $1, %%mm0\n\t" | |
703 "psrlw $1, %%mm2\n\t" | |
704 "packuswb %%mm2, %%mm0\n\t" | |
705 "movq %%mm0, %0\n\t" | |
706 :"=m"(*p) | |
707 :"m"(*pix) | |
708 :"memory"); | |
709 pix += line_size; | |
710 p += line_size; | |
711 } while (--h); | |
712 } | |
713 | |
714 static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
715 { | |
716 UINT8 *p; | |
717 const UINT8 *pix; | |
718 p = block; | |
719 pix = pixels; | |
720 __asm __volatile( | |
721 "pxor %%mm7, %%mm7\n\t":::"memory"); | |
722 do { | |
723 __asm __volatile( | |
724 "movq %1, %%mm0\n\t" | |
725 "movq %2, %%mm1\n\t" | |
726 "movq %0, %%mm4\n\t" | |
727 "movq %%mm0, %%mm2\n\t" | |
728 "movq %%mm1, %%mm3\n\t" | |
729 "movq %%mm4, %%mm5\n\t" | |
730 "punpcklbw %%mm7, %%mm0\n\t" | |
731 "punpcklbw %%mm7, %%mm1\n\t" | |
732 "punpckhbw %%mm7, %%mm2\n\t" | |
733 "punpckhbw %%mm7, %%mm3\n\t" | |
734 "punpcklbw %%mm7, %%mm4\n\t" | |
735 "punpckhbw %%mm7, %%mm5\n\t" | |
736 "paddusw %%mm1, %%mm0\n\t" | |
737 "paddusw %%mm3, %%mm2\n\t" | |
738 "psrlw $1, %%mm0\n\t" | |
739 "psrlw $1, %%mm2\n\t" | |
740 "paddusw %%mm4, %%mm0\n\t" | |
741 "paddusw %%mm5, %%mm2\n\t" | |
742 "psrlw $1, %%mm0\n\t" | |
743 "psrlw $1, %%mm2\n\t" | |
744 "packuswb %%mm2, %%mm0\n\t" | |
745 "movq %%mm0, %0\n\t" | |
746 :"=m"(*p) | |
747 :"m"(*pix), "m"(*(pix+line_size)) | |
748 :"memory"); | |
749 pix += line_size; | |
750 p += line_size ; | |
751 } while(--h); | |
752 } | |
753 | |
754 static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
755 { | |
756 UINT8 *p; | |
757 const UINT8 *pix; | |
758 p = block; | |
759 pix = pixels; | |
760 __asm __volatile( | |
761 "pxor %%mm7, %%mm7\n\t" | |
762 "movq %0, %%mm6\n\t" | |
8 | 763 ::"m"(mm_wone):"memory"); |
0 | 764 do { |
765 __asm __volatile( | |
766 "movq %1, %%mm0\n\t" | |
767 "movq %2, %%mm1\n\t" | |
768 "movq 1%1, %%mm4\n\t" | |
769 "movq 1%2, %%mm5\n\t" | |
770 "movq %%mm0, %%mm2\n\t" | |
771 "movq %%mm1, %%mm3\n\t" | |
772 "punpcklbw %%mm7, %%mm0\n\t" | |
773 "punpcklbw %%mm7, %%mm1\n\t" | |
774 "punpckhbw %%mm7, %%mm2\n\t" | |
775 "punpckhbw %%mm7, %%mm3\n\t" | |
776 "paddusw %%mm1, %%mm0\n\t" | |
777 "paddusw %%mm3, %%mm2\n\t" | |
778 "movq %%mm4, %%mm1\n\t" | |
779 "movq %%mm5, %%mm3\n\t" | |
780 "punpcklbw %%mm7, %%mm4\n\t" | |
781 "punpcklbw %%mm7, %%mm5\n\t" | |
782 "punpckhbw %%mm7, %%mm1\n\t" | |
783 "punpckhbw %%mm7, %%mm3\n\t" | |
784 "paddusw %%mm5, %%mm4\n\t" | |
785 "paddusw %%mm3, %%mm1\n\t" | |
786 "paddusw %%mm6, %%mm4\n\t" | |
787 "paddusw %%mm6, %%mm1\n\t" | |
788 "paddusw %%mm4, %%mm0\n\t" | |
789 "paddusw %%mm1, %%mm2\n\t" | |
790 "movq %0, %%mm1\n\t" | |
791 "psrlw $2, %%mm0\n\t" | |
792 "movq %%mm1, %%mm3\n\t" | |
793 "psrlw $2, %%mm2\n\t" | |
794 "punpcklbw %%mm7, %%mm1\n\t" | |
795 "punpckhbw %%mm7, %%mm3\n\t" | |
796 "paddusw %%mm1, %%mm0\n\t" | |
797 "paddusw %%mm3, %%mm2\n\t" | |
798 "psrlw $1, %%mm0\n\t" | |
799 "psrlw $1, %%mm2\n\t" | |
800 "packuswb %%mm2, %%mm0\n\t" | |
801 "movq %%mm0, %0\n\t" | |
802 :"=m"(*p) | |
803 :"m"(*pix), | |
804 "m"(*(pix+line_size)) | |
805 :"memory"); | |
806 pix += line_size; | |
807 p += line_size; | |
808 } while(--h); | |
809 } | |
810 | |
811 static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
812 { | |
813 DCTELEM *p; | |
814 const UINT8 *pix; | |
815 p = block; | |
816 pix = pixels; | |
817 __asm __volatile("pxor %%mm7, %%mm7":::"memory"); | |
818 do { | |
819 __asm __volatile( | |
820 "movq %0, %%mm0\n\t" | |
821 "movq %1, %%mm2\n\t" | |
822 "movq 8%0, %%mm1\n\t" | |
823 "movq %%mm2, %%mm3\n\t" | |
824 "punpcklbw %%mm7, %%mm2\n\t" | |
825 "punpckhbw %%mm7, %%mm3\n\t" | |
826 "psubsw %%mm2, %%mm0\n\t" | |
827 "psubsw %%mm3, %%mm1\n\t" | |
828 "movq %%mm0, %0\n\t" | |
829 "movq %%mm1, 8%0\n\t" | |
830 :"=m"(*p) | |
831 :"m"(*pix) | |
832 :"memory"); | |
833 pix += line_size; | |
834 p += 8; | |
835 } while (--h); | |
836 } | |
837 | |
838 static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
839 { | |
840 DCTELEM *p; | |
841 const UINT8 *pix; | |
842 p = block; | |
843 pix = pixels; | |
844 __asm __volatile( | |
845 "pxor %%mm7, %%mm7\n\t" | |
846 "movq %0, %%mm6" | |
8 | 847 ::"m"(mm_wone):"memory"); |
0 | 848 do { |
849 __asm __volatile( | |
850 "movq %0, %%mm0\n\t" | |
851 "movq %1, %%mm2\n\t" | |
852 "movq 8%0, %%mm1\n\t" | |
853 "movq 1%1, %%mm4\n\t" | |
854 "movq %%mm2, %%mm3\n\t" | |
855 "movq %%mm4, %%mm5\n\t" | |
856 "punpcklbw %%mm7, %%mm2\n\t" | |
857 "punpckhbw %%mm7, %%mm3\n\t" | |
858 "punpcklbw %%mm7, %%mm4\n\t" | |
859 "punpckhbw %%mm7, %%mm5\n\t" | |
860 "paddusw %%mm4, %%mm2\n\t" | |
861 "paddusw %%mm5, %%mm3\n\t" | |
862 "paddusw %%mm6, %%mm2\n\t" | |
863 "paddusw %%mm6, %%mm3\n\t" | |
864 "psrlw $1, %%mm2\n\t" | |
865 "psrlw $1, %%mm3\n\t" | |
866 "psubsw %%mm2, %%mm0\n\t" | |
867 "psubsw %%mm3, %%mm1\n\t" | |
868 "movq %%mm0, %0\n\t" | |
869 "movq %%mm1, 8%0\n\t" | |
870 :"=m"(*p) | |
871 :"m"(*pix) | |
872 :"memory"); | |
873 pix += line_size; | |
874 p += 8; | |
875 } while (--h); | |
876 } | |
877 | |
878 static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
879 { | |
880 DCTELEM *p; | |
881 const UINT8 *pix; | |
882 p = block; | |
883 pix = pixels; | |
884 __asm __volatile( | |
885 "pxor %%mm7, %%mm7\n\t" | |
886 "movq %0, %%mm6" | |
8 | 887 ::"m"(mm_wone):"memory"); |
0 | 888 do { |
889 __asm __volatile( | |
890 "movq %0, %%mm0\n\t" | |
891 "movq %1, %%mm2\n\t" | |
892 "movq 8%0, %%mm1\n\t" | |
893 "movq %2, %%mm4\n\t" | |
894 "movq %%mm2, %%mm3\n\t" | |
895 "movq %%mm4, %%mm5\n\t" | |
896 "punpcklbw %%mm7, %%mm2\n\t" | |
897 "punpckhbw %%mm7, %%mm3\n\t" | |
898 "punpcklbw %%mm7, %%mm4\n\t" | |
899 "punpckhbw %%mm7, %%mm5\n\t" | |
900 "paddusw %%mm4, %%mm2\n\t" | |
901 "paddusw %%mm5, %%mm3\n\t" | |
902 "paddusw %%mm6, %%mm2\n\t" | |
903 "paddusw %%mm6, %%mm3\n\t" | |
904 "psrlw $1, %%mm2\n\t" | |
905 "psrlw $1, %%mm3\n\t" | |
906 "psubsw %%mm2, %%mm0\n\t" | |
907 "psubsw %%mm3, %%mm1\n\t" | |
908 "movq %%mm0, %0\n\t" | |
909 "movq %%mm1, 8%0\n\t" | |
910 :"=m"(*p) | |
911 :"m"(*pix), "m"(*(pix+line_size)) | |
912 :"memory"); | |
913 pix += line_size; | |
914 p += 8; | |
915 } while (--h); | |
916 } | |
917 | |
918 static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
919 { | |
920 DCTELEM *p; | |
921 const UINT8 *pix; | |
922 p = block; | |
923 pix = pixels; | |
924 __asm __volatile( | |
925 "pxor %%mm7, %%mm7\n\t" | |
926 "movq %0, %%mm6\n\t" | |
8 | 927 ::"m"(mm_wtwo):"memory"); |
0 | 928 do { |
929 __asm __volatile( | |
930 "movq %1, %%mm0\n\t" | |
931 "movq %2, %%mm1\n\t" | |
932 "movq 1%1, %%mm4\n\t" | |
933 "movq 1%2, %%mm5\n\t" | |
934 "movq %%mm0, %%mm2\n\t" | |
935 "movq %%mm1, %%mm3\n\t" | |
936 "punpcklbw %%mm7, %%mm0\n\t" | |
937 "punpcklbw %%mm7, %%mm1\n\t" | |
938 "punpckhbw %%mm7, %%mm2\n\t" | |
939 "punpckhbw %%mm7, %%mm3\n\t" | |
940 "paddusw %%mm1, %%mm0\n\t" | |
941 "paddusw %%mm3, %%mm2\n\t" | |
942 "movq %%mm4, %%mm1\n\t" | |
943 "movq %%mm5, %%mm3\n\t" | |
944 "punpcklbw %%mm7, %%mm4\n\t" | |
945 "punpcklbw %%mm7, %%mm5\n\t" | |
946 "punpckhbw %%mm7, %%mm1\n\t" | |
947 "punpckhbw %%mm7, %%mm3\n\t" | |
948 "paddusw %%mm5, %%mm4\n\t" | |
949 "paddusw %%mm3, %%mm1\n\t" | |
950 "paddusw %%mm6, %%mm4\n\t" | |
951 "paddusw %%mm6, %%mm1\n\t" | |
952 "paddusw %%mm4, %%mm0\n\t" | |
953 "paddusw %%mm1, %%mm2\n\t" | |
954 "movq %0, %%mm1\n\t" | |
955 "movq 8%0, %%mm3\n\t" | |
956 "psrlw $2, %%mm0\n\t" | |
957 "psrlw $2, %%mm2\n\t" | |
958 "psubsw %%mm0, %%mm1\n\t" | |
959 "psubsw %%mm2, %%mm3\n\t" | |
960 "movq %%mm1, %0\n\t" | |
961 "movq %%mm3, 8%0\n\t" | |
962 :"=m"(*p) | |
963 :"m"(*pix), | |
964 "m"(*(pix+line_size)) | |
965 :"memory"); | |
966 pix += line_size; | |
967 p += 8 ; | |
968 } while(--h); | |
969 } | |
970 | |
971 void dsputil_init_mmx(void) | |
972 { | |
973 mm_flags = mm_support(); | |
974 #if 0 | |
975 printf("CPU flags:"); | |
976 if (mm_flags & MM_MMX) | |
977 printf(" mmx"); | |
978 if (mm_flags & MM_MMXEXT) | |
979 printf(" mmxext"); | |
980 if (mm_flags & MM_3DNOW) | |
981 printf(" 3dnow"); | |
982 if (mm_flags & MM_SSE) | |
983 printf(" sse"); | |
984 if (mm_flags & MM_SSE2) | |
985 printf(" sse2"); | |
986 printf("\n"); | |
987 #endif | |
988 | |
989 if (mm_flags & MM_MMX) { | |
990 get_pixels = get_pixels_mmx; | |
991 put_pixels_clamped = put_pixels_clamped_mmx; | |
992 add_pixels_clamped = add_pixels_clamped_mmx; | |
993 | |
994 pix_abs16x16 = pix_abs16x16_mmx; | |
995 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
996 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
997 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; | |
998 av_fdct = fdct_mmx; | |
999 | |
1000 put_pixels_tab[0] = put_pixels_mmx; | |
1001 put_pixels_tab[1] = put_pixels_x2_mmx; | |
1002 put_pixels_tab[2] = put_pixels_y2_mmx; | |
1003 put_pixels_tab[3] = put_pixels_xy2_mmx; | |
1004 | |
1005 put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
1006 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1007 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1008 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
1009 | |
1010 avg_pixels_tab[0] = avg_pixels_mmx; | |
1011 avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
1012 avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
1013 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1014 | |
1015 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
1016 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
1017 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
1018 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
1019 | |
1020 sub_pixels_tab[0] = sub_pixels_mmx; | |
1021 sub_pixels_tab[1] = sub_pixels_x2_mmx; | |
1022 sub_pixels_tab[2] = sub_pixels_y2_mmx; | |
1023 sub_pixels_tab[3] = sub_pixels_xy2_mmx; | |
1024 | |
1025 if (mm_flags & MM_MMXEXT) { | |
1026 pix_abs16x16 = pix_abs16x16_sse; | |
1027 } | |
1028 | |
1029 if (mm_flags & MM_SSE) { | |
1030 put_pixels_tab[1] = put_pixels_x2_sse; | |
1031 put_pixels_tab[2] = put_pixels_y2_sse; | |
1032 | |
1033 avg_pixels_tab[0] = avg_pixels_sse; | |
1034 avg_pixels_tab[1] = avg_pixels_x2_sse; | |
1035 avg_pixels_tab[2] = avg_pixels_y2_sse; | |
1036 avg_pixels_tab[3] = avg_pixels_xy2_sse; | |
1037 | |
1038 sub_pixels_tab[1] = sub_pixels_x2_sse; | |
1039 sub_pixels_tab[2] = sub_pixels_y2_sse; | |
1040 } else if (mm_flags & MM_3DNOW) { | |
1041 put_pixels_tab[1] = put_pixels_x2_3dnow; | |
1042 put_pixels_tab[2] = put_pixels_y2_3dnow; | |
1043 | |
1044 avg_pixels_tab[0] = avg_pixels_3dnow; | |
1045 avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
1046 avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
1047 avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
1048 | |
1049 sub_pixels_tab[1] = sub_pixels_x2_3dnow; | |
1050 sub_pixels_tab[2] = sub_pixels_y2_3dnow; | |
1051 } | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1052 |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1053 #ifdef USE_MMX_IDCT |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1054 /* use MMX / MMXEXT iDCT code from libmpeg2 */ |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1055 //printf("LIBAVCODEC: Using MMX%s iDCT code\n",(mm_flags & MM_MMXEXT)?"EXT":""); |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1056 ff_idct = (mm_flags & MM_MMXEXT) ? mmxext_idct : mmx_idct; |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1057 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */ |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1058 { int i,j; |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1059 for (i = 0; i < 64; i++) { |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1060 j = zigzag_direct[i]; |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1061 zigzag_direct[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1062 j = ff_alternate_horizontal_scan[i]; |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1063 ff_alternate_horizontal_scan[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1064 j = ff_alternate_vertical_scan[i]; |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1065 ff_alternate_vertical_scan[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1066 } |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1067 } |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1068 #endif |
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1069 |
0 | 1070 } |
1071 } |