Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 188:5d56c2f7e712 libavcodec
print cpu flags
author | uid46427 |
---|---|
date | Thu, 10 Jan 2002 00:41:53 +0000 |
parents | ac5075a55488 |
children | 6f48cacd9ed9 |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
3 * Copyright (c) 2000, 2001 Gerard Lantau. | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 * | |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 #include "../dsputil.h" | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
23 #include "../simple_idct.h" |
0 | 24 |
5 | 25 int mm_flags; /* multimedia extension flags */ |
26 | |
/* Prototypes for the 16x16 pix_abs routines (plain, x2, y2, xy2 and an
 * _sse variant). Only the declarations appear at this point; judging by
 * the names they compute absolute pixel differences -- confirm against
 * their definitions. */
0 | 27 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); |
28 int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
29 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
30 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
31 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
32 | |
42 | 33 /* external functions, from idct_mmx.c */ |
34 void ff_mmx_idct(DCTELEM *block); | |
35 void ff_mmxext_idct(DCTELEM *block); | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
36 |
0 | 37 /* pixel operations */ |
/* 64-bit constants holding four packed 16-bit words, each equal to 1
 * (mm_wone) or 2 (mm_wtwo). The routines below load them into an MMX
 * register and add them as the rounding term before a right shift. */
8 | 38 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001; |
39 static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002; | |
40 //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; | |
41 //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; | |
0 | 42 |
43 /***********************************/ | |
44 /* 3Dnow specific */ | |
45 | |
/* dsputil_mmx_avg.h is included twice, each time with a different DEF()
 * name suffix and a different PAVGB mnemonic. Presumably the header is a
 * template that emits one set of functions per inclusion -- confirm
 * against the header itself. */
46 #define DEF(x) x ## _3dnow | |
47 /* for Athlons PAVGUSB is preferred */ | |
48 #define PAVGB "pavgusb" | |
49 | |
50 #include "dsputil_mmx_avg.h" | |
51 | |
52 #undef DEF | |
53 #undef PAVGB | |
54 | |
55 /***********************************/ | |
56 /* MMX2 specific */ | |
57 | |
58 #define DEF(x) x ## _sse | |
59 | |
60 /* Introduced only in MMX2 set */ | |
61 #define PAVGB "pavgb" | |
62 | |
63 #include "dsputil_mmx_avg.h" | |
64 | |
65 #undef DEF | |
66 #undef PAVGB | |
68 /***********************************/ | |
69 /* standard MMX */ | |
70 | |
/* Widen an 8x8 block of bytes into 16-bit values.
 *
 * block:     destination; 64 consecutive elements are written.
 * pixels:    source; 8 bytes are read from each of 8 rows.
 * line_size: byte distance between consecutive source rows.
 *
 * Each of the 4 iterations handles two rows: the bytes are interleaved
 * with the zeroed mm7 (i.e. zero-extended to words) and 32 bytes are
 * stored. The 32-byte store paired with "p += 16" implies DCTELEM is
 * 16 bits wide. */
71 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
72 { | |
73 DCTELEM *p; | |
74 const UINT8 *pix; | |
75 int i; | |
76 | |
77 /* read the pixels */ | |
78 p = block; | |
79 pix = pixels; | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
80 __asm __volatile("pxor %%mm7, %%mm7":); |
0 | 81 for(i=0;i<4;i++) { |
82 __asm __volatile( | |
83 "movq %1, %%mm0\n\t" | |
84 "movq %2, %%mm1\n\t" | |
85 "movq %%mm0, %%mm2\n\t" | |
86 "movq %%mm1, %%mm3\n\t" | |
87 "punpcklbw %%mm7, %%mm0\n\t" | |
88 "punpckhbw %%mm7, %%mm2\n\t" | |
89 "punpcklbw %%mm7, %%mm1\n\t" | |
90 "punpckhbw %%mm7, %%mm3\n\t" | |
91 "movq %%mm0, %0\n\t" | |
92 "movq %%mm2, 8%0\n\t" | |
93 "movq %%mm1, 16%0\n\t" | |
94 "movq %%mm3, 24%0\n\t" | |
95 :"=m"(*p) | |
96 :"m"(*pix), "m"(*(pix+line_size)) | |
97 :"memory"); | |
98 pix += line_size*2; | |
99 p += 16; | |
100 } | |
101 } | |
102 | |
/* Store an 8x8 block of 16-bit values as bytes.
 *
 * block:     source; 64 consecutive elements are read.
 * pixels:    destination; 8 bytes are written to each of 8 rows.
 * line_size: byte distance between consecutive destination rows.
 *
 * Each of the 2 iterations loads 64 bytes (four rows of words), narrows
 * them with packuswb (signed words to unsigned bytes with saturation)
 * and stores four rows at pix, pix+line_size, pix+2*line_size and
 * pix+3*line_size. */
103 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
104 { | |
105 const DCTELEM *p; | |
106 UINT8 *pix; | |
107 int i; | |
108 | |
109 /* pack the block values and write them out as pixels */ | |
110 p = block; | |
111 pix = pixels; | |
112 for(i=0;i<2;i++) { | |
113 __asm __volatile( | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
114 "movq %3, %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
115 "movq 8%3, %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
116 "movq 16%3, %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
117 "movq 24%3, %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
118 "movq 32%3, %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
119 "movq 40%3, %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
120 "movq 48%3, %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
121 "movq 56%3, %%mm7\n\t" |
0 | 122 "packuswb %%mm1, %%mm0\n\t" |
123 "packuswb %%mm3, %%mm2\n\t" | |
124 "packuswb %%mm5, %%mm4\n\t" | |
125 "packuswb %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
126 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
127 "movq %%mm2, (%0, %1)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
128 "movq %%mm4, (%0, %1, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
129 "movq %%mm6, (%0, %2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
130 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) |
0 | 131 :"memory"); |
132 pix += line_size*4; | |
133 p += 32; | |
134 } | |
135 } | |
136 | |
/* Add an 8x8 block of 16-bit values to an 8x8 block of bytes in place.
 *
 * block:     source; 64 consecutive elements are read.
 * pixels:    read and rewritten; 8 bytes on each of 8 rows.
 * line_size: byte distance between consecutive pixel rows.
 *
 * Each of the 4 iterations handles two rows: the existing bytes are
 * zero-extended to words (mm7 is zeroed first), added to the block
 * values with paddsw (signed saturating add), then narrowed back to
 * bytes with packuswb (unsigned saturation) and stored. */
137 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
138 { | |
139 const DCTELEM *p; | |
140 UINT8 *pix; | |
141 int i; | |
142 | |
143 /* add the block values to the pixels in place */ | |
144 p = block; | |
145 pix = pixels; | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
146 __asm __volatile("pxor %%mm7, %%mm7":); |
0 | 147 for(i=0;i<4;i++) { |
148 __asm __volatile( | |
149 "movq %2, %%mm0\n\t" | |
150 "movq 8%2, %%mm1\n\t" | |
151 "movq 16%2, %%mm2\n\t" | |
152 "movq 24%2, %%mm3\n\t" | |
153 "movq %0, %%mm4\n\t" | |
154 "movq %1, %%mm6\n\t" | |
155 "movq %%mm4, %%mm5\n\t" | |
156 "punpcklbw %%mm7, %%mm4\n\t" | |
157 "punpckhbw %%mm7, %%mm5\n\t" | |
158 "paddsw %%mm4, %%mm0\n\t" | |
159 "paddsw %%mm5, %%mm1\n\t" | |
160 "movq %%mm6, %%mm5\n\t" | |
161 "punpcklbw %%mm7, %%mm6\n\t" | |
162 "punpckhbw %%mm7, %%mm5\n\t" | |
163 "paddsw %%mm6, %%mm2\n\t" | |
164 "paddsw %%mm5, %%mm3\n\t" | |
165 "packuswb %%mm1, %%mm0\n\t" | |
166 "packuswb %%mm3, %%mm2\n\t" | |
167 "movq %%mm0, %0\n\t" | |
168 "movq %%mm2, %1\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
169 :"+m"(*pix), "+m"(*(pix+line_size)) |
0 | 170 :"m"(*p) |
171 :"memory"); | |
172 pix += line_size*2; | |
173 p += 16; | |
174 } | |
175 } | |
176 | |
/* Copy an 8-byte-wide column of h rows from pixels to block.
 *
 * block:     destination; 8 bytes are written per row.
 * pixels:    source; 8 bytes are read per row.
 * line_size: byte distance between rows, used for both source and
 *            destination.
 * h:         number of rows.
 *
 * Rows are copied four at a time (h>>2 iterations); the remaining h&3
 * rows are then copied one by one. */
177 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
178 { | |
179 int dh, hh; | |
180 UINT8 *p; | |
181 const UINT8 *pix; | |
182 p = block; | |
183 pix = pixels; | |
184 hh=h>>2; | |
185 dh=h&3; | |
186 while(hh--) { | |
187 __asm __volatile( | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
188 "movq (%1), %%mm0 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
189 "movq (%1, %2), %%mm1 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
190 "movq (%1, %2, 2), %%mm2 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
191 "movq (%1, %3), %%mm3 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
192 "movq %%mm0, (%0) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
193 "movq %%mm1, (%0, %2) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
194 "movq %%mm2, (%0, %2, 2) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
195 "movq %%mm3, (%0, %3) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
196 ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3) |
0 | 197 :"memory"); |
198 pix = pix + line_size*4; | |
199 p = p + line_size*4; | |
200 } | |
201 while(dh--) { | |
202 __asm __volatile( | |
203 "movq %1, %%mm0\n\t" | |
204 "movq %%mm0, %0\n\t" | |
205 :"=m"(*p) | |
206 :"m"(*pix) | |
207 :"memory"); | |
208 pix = pix + line_size; | |
209 p = p + line_size; | |
210 } | |
211 } | |
212 | |
/* Horizontal half-pel copy with rounding, 8 bytes wide, h rows:
 *   block[x] = (pixels[x] + pixels[x+1] + 1) >> 1
 *
 * Bytes are zero-extended to words (mm7 = 0), summed, biased by mm_wone
 * (held in mm4), shifted and packed back to bytes. The second load is at
 * byte offset 1, so 9 source bytes per row are touched.
 * line_size is the row stride for both source and destination.
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
213 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
214 { | |
215 UINT8 *p; | |
216 const UINT8 *pix; | |
217 p = block; | |
218 pix = pixels; | |
219 __asm __volatile( | |
220 "pxor %%mm7, %%mm7\n\t" | |
221 "movq %0, %%mm4\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
222 ::"m"(mm_wone)); |
0 | 223 do { |
224 __asm __volatile( | |
225 "movq %1, %%mm0\n\t" | |
226 "movq 1%1, %%mm1\n\t" | |
227 "movq %%mm0, %%mm2\n\t" | |
228 "movq %%mm1, %%mm3\n\t" | |
229 "punpcklbw %%mm7, %%mm0\n\t" | |
230 "punpcklbw %%mm7, %%mm1\n\t" | |
231 "punpckhbw %%mm7, %%mm2\n\t" | |
232 "punpckhbw %%mm7, %%mm3\n\t" | |
233 "paddusw %%mm1, %%mm0\n\t" | |
234 "paddusw %%mm3, %%mm2\n\t" | |
235 "paddusw %%mm4, %%mm0\n\t" | |
236 "paddusw %%mm4, %%mm2\n\t" | |
237 "psrlw $1, %%mm0\n\t" | |
238 "psrlw $1, %%mm2\n\t" | |
239 "packuswb %%mm2, %%mm0\n\t" | |
240 "movq %%mm0, %0\n\t" | |
241 :"=m"(*p) | |
242 :"m"(*pix) | |
243 :"memory"); | |
244 pix += line_size; p += line_size; | |
245 } while (--h); | |
246 } | |
247 | |
/* Vertical half-pel copy with rounding, 8 bytes wide, h rows:
 *   block[x] = (pixels[x] + pixels[x + line_size] + 1) >> 1
 *
 * The rounding word comes from mm_wone (held in mm4); mm7 is zero and is
 * used to widen bytes to words. Each row also reads the row below it,
 * so h+1 source rows are touched in total.
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
248 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
249 { | |
250 UINT8 *p; | |
251 const UINT8 *pix; | |
252 p = block; | |
253 pix = pixels; | |
254 __asm __volatile( | |
255 "pxor %%mm7, %%mm7\n\t" | |
256 "movq %0, %%mm4\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
257 ::"m"(mm_wone)); |
0 | 258 do { |
259 __asm __volatile( | |
260 "movq %1, %%mm0\n\t" | |
261 "movq %2, %%mm1\n\t" | |
262 "movq %%mm0, %%mm2\n\t" | |
263 "movq %%mm1, %%mm3\n\t" | |
264 "punpcklbw %%mm7, %%mm0\n\t" | |
265 "punpcklbw %%mm7, %%mm1\n\t" | |
266 "punpckhbw %%mm7, %%mm2\n\t" | |
267 "punpckhbw %%mm7, %%mm3\n\t" | |
268 "paddusw %%mm1, %%mm0\n\t" | |
269 "paddusw %%mm3, %%mm2\n\t" | |
270 "paddusw %%mm4, %%mm0\n\t" | |
271 "paddusw %%mm4, %%mm2\n\t" | |
272 "psrlw $1, %%mm0\n\t" | |
273 "psrlw $1, %%mm2\n\t" | |
274 "packuswb %%mm2, %%mm0\n\t" | |
275 "movq %%mm0, %0\n\t" | |
276 :"=m"(*p) | |
277 :"m"(*pix), | |
278 "m"(*(pix+line_size)) | |
279 :"memory"); | |
280 pix += line_size; | |
281 p += line_size; | |
282 } while (--h); | |
283 } | |
284 | |
/* Diagonal half-pel copy with rounding, 8 bytes wide, h rows:
 *   block[x] = (pixels[x] + pixels[x+1]
 *               + pixels[x+line_size] + pixels[x+line_size+1] + 2) >> 2
 *
 * The rounding word comes from mm_wtwo (held in mm6); mm7 is zero and is
 * used to widen bytes to words. Loads at byte offset 1 mean 9 bytes are
 * touched on each of the two source rows.
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
285 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
286 { | |
287 UINT8 *p; | |
288 const UINT8 *pix; | |
289 p = block; | |
290 pix = pixels; | |
291 __asm __volatile( | |
292 "pxor %%mm7, %%mm7\n\t" | |
293 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
294 ::"m"(mm_wtwo)); |
0 | 295 do { |
296 __asm __volatile( | |
297 "movq %1, %%mm0\n\t" | |
298 "movq %2, %%mm1\n\t" | |
299 "movq 1%1, %%mm4\n\t" | |
300 "movq 1%2, %%mm5\n\t" | |
301 "movq %%mm0, %%mm2\n\t" | |
302 "movq %%mm1, %%mm3\n\t" | |
303 "punpcklbw %%mm7, %%mm0\n\t" | |
304 "punpcklbw %%mm7, %%mm1\n\t" | |
305 "punpckhbw %%mm7, %%mm2\n\t" | |
306 "punpckhbw %%mm7, %%mm3\n\t" | |
307 "paddusw %%mm1, %%mm0\n\t" | |
308 "paddusw %%mm3, %%mm2\n\t" | |
309 "movq %%mm4, %%mm1\n\t" | |
310 "movq %%mm5, %%mm3\n\t" | |
311 "punpcklbw %%mm7, %%mm4\n\t" | |
312 "punpcklbw %%mm7, %%mm5\n\t" | |
313 "punpckhbw %%mm7, %%mm1\n\t" | |
314 "punpckhbw %%mm7, %%mm3\n\t" | |
315 "paddusw %%mm5, %%mm4\n\t" | |
316 "paddusw %%mm3, %%mm1\n\t" | |
317 "paddusw %%mm6, %%mm4\n\t" | |
318 "paddusw %%mm6, %%mm1\n\t" | |
319 "paddusw %%mm4, %%mm0\n\t" | |
320 "paddusw %%mm1, %%mm2\n\t" | |
321 "psrlw $2, %%mm0\n\t" | |
322 "psrlw $2, %%mm2\n\t" | |
323 "packuswb %%mm2, %%mm0\n\t" | |
324 "movq %%mm0, %0\n\t" | |
325 :"=m"(*p) | |
326 :"m"(*pix), | |
327 "m"(*(pix+line_size)) | |
328 :"memory"); | |
329 pix += line_size; | |
330 p += line_size; | |
331 } while(--h); | |
332 } | |
333 | |
/* Horizontal half-pel copy without the rounding term, 8 bytes wide,
 * h rows:
 *   block[x] = (pixels[x] + pixels[x+1]) >> 1
 *
 * mm7 is zeroed and used to widen bytes to words. The second load is at
 * byte offset 1, so 9 source bytes per row are touched.
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
334 static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
335 { | |
336 UINT8 *p; | |
337 const UINT8 *pix; | |
338 p = block; | |
339 pix = pixels; | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
340 __asm __volatile("pxor %%mm7, %%mm7\n\t":); |
0 | 341 do { |
342 __asm __volatile( | |
343 "movq %1, %%mm0\n\t" | |
344 "movq 1%1, %%mm1\n\t" | |
345 "movq %%mm0, %%mm2\n\t" | |
346 "movq %%mm1, %%mm3\n\t" | |
347 "punpcklbw %%mm7, %%mm0\n\t" | |
348 "punpcklbw %%mm7, %%mm1\n\t" | |
349 "punpckhbw %%mm7, %%mm2\n\t" | |
350 "punpckhbw %%mm7, %%mm3\n\t" | |
351 "paddusw %%mm1, %%mm0\n\t" | |
352 "paddusw %%mm3, %%mm2\n\t" | |
353 "psrlw $1, %%mm0\n\t" | |
354 "psrlw $1, %%mm2\n\t" | |
355 "packuswb %%mm2, %%mm0\n\t" | |
356 "movq %%mm0, %0\n\t" | |
357 :"=m"(*p) | |
358 :"m"(*pix) | |
359 :"memory"); | |
360 pix += line_size; | |
361 p += line_size; | |
362 } while (--h); | |
363 } | |
364 | |
/* Vertical half-pel copy without the rounding term, 8 bytes wide,
 * h rows:
 *   block[x] = (pixels[x] + pixels[x + line_size]) >> 1
 *
 * mm7 is zeroed and used to widen bytes to words. Each row also reads
 * the row below it, so h+1 source rows are touched in total.
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
365 static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
366 { | |
367 UINT8 *p; | |
368 const UINT8 *pix; | |
369 p = block; | |
370 pix = pixels; | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
371 __asm __volatile("pxor %%mm7, %%mm7\n\t":); |
0 | 372 do { |
373 __asm __volatile( | |
374 "movq %1, %%mm0\n\t" | |
375 "movq %2, %%mm1\n\t" | |
376 "movq %%mm0, %%mm2\n\t" | |
377 "movq %%mm1, %%mm3\n\t" | |
378 "punpcklbw %%mm7, %%mm0\n\t" | |
379 "punpcklbw %%mm7, %%mm1\n\t" | |
380 "punpckhbw %%mm7, %%mm2\n\t" | |
381 "punpckhbw %%mm7, %%mm3\n\t" | |
382 "paddusw %%mm1, %%mm0\n\t" | |
383 "paddusw %%mm3, %%mm2\n\t" | |
384 "psrlw $1, %%mm0\n\t" | |
385 "psrlw $1, %%mm2\n\t" | |
386 "packuswb %%mm2, %%mm0\n\t" | |
387 "movq %%mm0, %0\n\t" | |
388 :"=m"(*p) | |
389 :"m"(*pix), | |
390 "m"(*(pix+line_size)) | |
391 :"memory"); | |
392 pix += line_size; | |
393 p += line_size; | |
394 } while(--h); | |
395 } | |
396 | |
/* Diagonal half-pel copy, "no_rnd" variant, 8 bytes wide, h rows:
 *   block[x] = (pixels[x] + pixels[x+1]
 *               + pixels[x+line_size] + pixels[x+line_size+1] + 1) >> 2
 *
 * Same instruction sequence as the rounding xy2 copy, except that the
 * bias held in mm6 is mm_wone (1) rather than mm_wtwo (2). Loads at byte
 * offset 1 mean 9 bytes are touched on each of the two source rows.
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
397 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
398 { | |
399 UINT8 *p; | |
400 const UINT8 *pix; | |
401 p = block; | |
402 pix = pixels; | |
403 __asm __volatile( | |
404 "pxor %%mm7, %%mm7\n\t" | |
405 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
406 ::"m"(mm_wone)); |
0 | 407 do { |
408 __asm __volatile( | |
409 "movq %1, %%mm0\n\t" | |
410 "movq %2, %%mm1\n\t" | |
411 "movq 1%1, %%mm4\n\t" | |
412 "movq 1%2, %%mm5\n\t" | |
413 "movq %%mm0, %%mm2\n\t" | |
414 "movq %%mm1, %%mm3\n\t" | |
415 "punpcklbw %%mm7, %%mm0\n\t" | |
416 "punpcklbw %%mm7, %%mm1\n\t" | |
417 "punpckhbw %%mm7, %%mm2\n\t" | |
418 "punpckhbw %%mm7, %%mm3\n\t" | |
419 "paddusw %%mm1, %%mm0\n\t" | |
420 "paddusw %%mm3, %%mm2\n\t" | |
421 "movq %%mm4, %%mm1\n\t" | |
422 "movq %%mm5, %%mm3\n\t" | |
423 "punpcklbw %%mm7, %%mm4\n\t" | |
424 "punpcklbw %%mm7, %%mm5\n\t" | |
425 "punpckhbw %%mm7, %%mm1\n\t" | |
426 "punpckhbw %%mm7, %%mm3\n\t" | |
427 "paddusw %%mm5, %%mm4\n\t" | |
428 "paddusw %%mm3, %%mm1\n\t" | |
429 "paddusw %%mm6, %%mm4\n\t" | |
430 "paddusw %%mm6, %%mm1\n\t" | |
431 "paddusw %%mm4, %%mm0\n\t" | |
432 "paddusw %%mm1, %%mm2\n\t" | |
433 "psrlw $2, %%mm0\n\t" | |
434 "psrlw $2, %%mm2\n\t" | |
435 "packuswb %%mm2, %%mm0\n\t" | |
436 "movq %%mm0, %0\n\t" | |
437 :"=m"(*p) | |
438 :"m"(*pix), | |
439 "m"(*(pix+line_size)) | |
440 :"memory"); | |
441 pix += line_size; | |
442 p += line_size; | |
443 } while(--h); | |
444 } | |
445 | |
/* Average the source into the destination with rounding, 8 bytes wide,
 * h rows:
 *   block[x] = (block[x] + pixels[x] + 1) >> 1
 *
 * block is both read and written ("+m" operand). The rounding word comes
 * from mm_wone (held in mm6); mm7 is zero and widens bytes to words.
 * line_size is the row stride for both source and destination.
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
446 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
447 { | |
448 UINT8 *p; | |
449 const UINT8 *pix; | |
450 p = block; | |
451 pix = pixels; | |
452 __asm __volatile( | |
453 "pxor %%mm7, %%mm7\n\t" | |
454 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
455 ::"m"(mm_wone)); |
0 | 456 do { |
457 __asm __volatile( | |
458 "movq %0, %%mm0\n\t" | |
459 "movq %1, %%mm1\n\t" | |
460 "movq %%mm0, %%mm2\n\t" | |
461 "movq %%mm1, %%mm3\n\t" | |
462 "punpcklbw %%mm7, %%mm0\n\t" | |
463 "punpcklbw %%mm7, %%mm1\n\t" | |
464 "punpckhbw %%mm7, %%mm2\n\t" | |
465 "punpckhbw %%mm7, %%mm3\n\t" | |
466 "paddusw %%mm1, %%mm0\n\t" | |
467 "paddusw %%mm3, %%mm2\n\t" | |
468 "paddusw %%mm6, %%mm0\n\t" | |
469 "paddusw %%mm6, %%mm2\n\t" | |
470 "psrlw $1, %%mm0\n\t" | |
471 "psrlw $1, %%mm2\n\t" | |
472 "packuswb %%mm2, %%mm0\n\t" | |
473 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
474 :"+m"(*p) |
0 | 475 :"m"(*pix) |
476 :"memory"); | |
477 pix += line_size; | |
478 p += line_size; | |
479 } | |
480 while (--h); | |
481 } | |
482 | |
/* Average the horizontally interpolated source into the destination,
 * 8 bytes wide, h rows. Two rounded averages are chained:
 *   t        = (pixels[x] + pixels[x+1] + 1) >> 1
 *   block[x] = (block[x] + t + 1) >> 1
 *
 * block is both read and written ("+m" operand). The rounding word comes
 * from mm_wone (held in mm6); mm7 is zero and widens bytes to words.
 * The load at byte offset 1 means 9 source bytes per row are touched.
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
483 static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
484 { | |
485 UINT8 *p; | |
486 const UINT8 *pix; | |
487 p = block; | |
488 pix = pixels; | |
489 __asm __volatile( | |
490 "pxor %%mm7, %%mm7\n\t" | |
491 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
492 ::"m"(mm_wone)); |
0 | 493 do { |
494 __asm __volatile( | |
495 "movq %1, %%mm1\n\t" | |
496 "movq %0, %%mm0\n\t" | |
497 "movq 1%1, %%mm4\n\t" | |
498 "movq %%mm0, %%mm2\n\t" | |
499 "movq %%mm1, %%mm3\n\t" | |
500 "movq %%mm4, %%mm5\n\t" | |
501 "punpcklbw %%mm7, %%mm1\n\t" | |
502 "punpckhbw %%mm7, %%mm3\n\t" | |
503 "punpcklbw %%mm7, %%mm4\n\t" | |
504 "punpckhbw %%mm7, %%mm5\n\t" | |
505 "punpcklbw %%mm7, %%mm0\n\t" | |
506 "punpckhbw %%mm7, %%mm2\n\t" | |
507 "paddusw %%mm4, %%mm1\n\t" | |
508 "paddusw %%mm5, %%mm3\n\t" | |
509 "paddusw %%mm6, %%mm1\n\t" | |
510 "paddusw %%mm6, %%mm3\n\t" | |
511 "psrlw $1, %%mm1\n\t" | |
512 "psrlw $1, %%mm3\n\t" | |
513 "paddusw %%mm6, %%mm0\n\t" | |
514 "paddusw %%mm6, %%mm2\n\t" | |
515 "paddusw %%mm1, %%mm0\n\t" | |
516 "paddusw %%mm3, %%mm2\n\t" | |
517 "psrlw $1, %%mm0\n\t" | |
518 "psrlw $1, %%mm2\n\t" | |
519 "packuswb %%mm2, %%mm0\n\t" | |
520 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
521 :"+m"(*p) |
0 | 522 :"m"(*pix) |
523 :"memory"); | |
524 pix += line_size; | |
525 p += line_size; | |
526 } while (--h); | |
527 } | |
528 | |
/* Average the vertically interpolated source into the destination,
 * 8 bytes wide, h rows. Two rounded averages are chained:
 *   t        = (pixels[x] + pixels[x + line_size] + 1) >> 1
 *   block[x] = (block[x] + t + 1) >> 1
 *
 * block is both read and written ("+m" operand). The rounding word comes
 * from mm_wone (held in mm6); mm7 is zero and widens bytes to words.
 * Each row also reads the source row below it.
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
529 static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
530 { | |
531 UINT8 *p; | |
532 const UINT8 *pix; | |
533 p = block; | |
534 pix = pixels; | |
535 __asm __volatile( | |
536 "pxor %%mm7, %%mm7\n\t" | |
537 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
538 ::"m"(mm_wone)); |
0 | 539 do { |
540 __asm __volatile( | |
541 "movq %1, %%mm1\n\t" | |
542 "movq %0, %%mm0\n\t" | |
543 "movq %2, %%mm4\n\t" | |
544 "movq %%mm0, %%mm2\n\t" | |
545 "movq %%mm1, %%mm3\n\t" | |
546 "movq %%mm4, %%mm5\n\t" | |
547 "punpcklbw %%mm7, %%mm1\n\t" | |
548 "punpckhbw %%mm7, %%mm3\n\t" | |
549 "punpcklbw %%mm7, %%mm4\n\t" | |
550 "punpckhbw %%mm7, %%mm5\n\t" | |
551 "punpcklbw %%mm7, %%mm0\n\t" | |
552 "punpckhbw %%mm7, %%mm2\n\t" | |
553 "paddusw %%mm4, %%mm1\n\t" | |
554 "paddusw %%mm5, %%mm3\n\t" | |
555 "paddusw %%mm6, %%mm1\n\t" | |
556 "paddusw %%mm6, %%mm3\n\t" | |
557 "psrlw $1, %%mm1\n\t" | |
558 "psrlw $1, %%mm3\n\t" | |
559 "paddusw %%mm6, %%mm0\n\t" | |
560 "paddusw %%mm6, %%mm2\n\t" | |
561 "paddusw %%mm1, %%mm0\n\t" | |
562 "paddusw %%mm3, %%mm2\n\t" | |
563 "psrlw $1, %%mm0\n\t" | |
564 "psrlw $1, %%mm2\n\t" | |
565 "packuswb %%mm2, %%mm0\n\t" | |
566 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
567 :"+m"(*p) |
0 | 568 :"m"(*pix), "m"(*(pix+line_size)) |
569 :"memory"); | |
570 pix += line_size; | |
571 p += line_size ; | |
572 } while(--h); | |
573 } | |
574 | |
/* Average the diagonally interpolated source into the destination,
 * 8 bytes wide, h rows:
 *   t        = (pixels[x] + pixels[x+1]
 *               + pixels[x+line_size] + pixels[x+line_size+1] + 2) >> 2
 *   block[x] = (block[x] + t + 1) >> 1
 *
 * mm6 holds mm_wtwo (bias for the 4-tap sum); mm_wone is reloaded into
 * mm5 on every iteration as the bias for the final average. mm7 is zero
 * and widens bytes to words. block is both read and written ("+m").
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
575 static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
576 { | |
577 UINT8 *p; | |
578 const UINT8 *pix; | |
579 p = block; | |
580 pix = pixels; | |
581 __asm __volatile( | |
582 "pxor %%mm7, %%mm7\n\t" | |
583 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
584 ::"m"(mm_wtwo)); |
0 | 585 do { |
586 __asm __volatile( | |
587 "movq %1, %%mm0\n\t" | |
588 "movq %2, %%mm1\n\t" | |
589 "movq 1%1, %%mm4\n\t" | |
590 "movq 1%2, %%mm5\n\t" | |
591 "movq %%mm0, %%mm2\n\t" | |
592 "movq %%mm1, %%mm3\n\t" | |
593 "punpcklbw %%mm7, %%mm0\n\t" | |
594 "punpcklbw %%mm7, %%mm1\n\t" | |
595 "punpckhbw %%mm7, %%mm2\n\t" | |
596 "punpckhbw %%mm7, %%mm3\n\t" | |
597 "paddusw %%mm1, %%mm0\n\t" | |
598 "paddusw %%mm3, %%mm2\n\t" | |
599 "movq %%mm4, %%mm1\n\t" | |
600 "movq %%mm5, %%mm3\n\t" | |
601 "punpcklbw %%mm7, %%mm4\n\t" | |
602 "punpcklbw %%mm7, %%mm5\n\t" | |
603 "punpckhbw %%mm7, %%mm1\n\t" | |
604 "punpckhbw %%mm7, %%mm3\n\t" | |
605 "paddusw %%mm5, %%mm4\n\t" | |
606 "paddusw %%mm3, %%mm1\n\t" | |
607 "paddusw %%mm6, %%mm4\n\t" | |
608 "paddusw %%mm6, %%mm1\n\t" | |
609 "paddusw %%mm4, %%mm0\n\t" | |
610 "paddusw %%mm1, %%mm2\n\t" | |
611 "movq %3, %%mm5\n\t" | |
612 "psrlw $2, %%mm0\n\t" | |
613 "movq %0, %%mm1\n\t" | |
614 "psrlw $2, %%mm2\n\t" | |
615 "movq %%mm1, %%mm3\n\t" | |
616 "punpcklbw %%mm7, %%mm1\n\t" | |
617 "punpckhbw %%mm7, %%mm3\n\t" | |
618 "paddusw %%mm1, %%mm0\n\t" | |
619 "paddusw %%mm3, %%mm2\n\t" | |
620 "paddusw %%mm5, %%mm0\n\t" | |
621 "paddusw %%mm5, %%mm2\n\t" | |
622 "psrlw $1, %%mm0\n\t" | |
623 "psrlw $1, %%mm2\n\t" | |
624 "packuswb %%mm2, %%mm0\n\t" | |
625 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
626 :"+m"(*p) |
0 | 627 :"m"(*pix), |
8 | 628 "m"(*(pix+line_size)), "m"(mm_wone) |
0 | 629 :"memory"); |
630 pix += line_size; | |
631 p += line_size ; | |
632 } while(--h); | |
633 } | |
634 | |
/* Average the source into the destination without a rounding term,
 * 8 bytes wide, h rows:
 *   block[x] = (block[x] + pixels[x]) >> 1
 *
 * block is both read and written ("+m" operand). mm7 is zeroed and used
 * to widen bytes to words.
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
635 static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
636 { | |
637 UINT8 *p; | |
638 const UINT8 *pix; | |
639 p = block; | |
640 pix = pixels; | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
641 __asm __volatile("pxor %%mm7, %%mm7\n\t":); |
0 | 642 do { |
643 __asm __volatile( | |
644 "movq %1, %%mm0\n\t" | |
645 "movq %0, %%mm1\n\t" | |
646 "movq %%mm0, %%mm2\n\t" | |
647 "movq %%mm1, %%mm3\n\t" | |
648 "punpcklbw %%mm7, %%mm0\n\t" | |
649 "punpcklbw %%mm7, %%mm1\n\t" | |
650 "punpckhbw %%mm7, %%mm2\n\t" | |
651 "punpckhbw %%mm7, %%mm3\n\t" | |
652 "paddusw %%mm1, %%mm0\n\t" | |
653 "paddusw %%mm3, %%mm2\n\t" | |
654 "psrlw $1, %%mm0\n\t" | |
655 "psrlw $1, %%mm2\n\t" | |
656 "packuswb %%mm2, %%mm0\n\t" | |
657 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
658 :"+m"(*p) |
0 | 659 :"m"(*pix) |
660 :"memory"); | |
661 pix += line_size; | |
662 p += line_size ; | |
663 } while (--h); | |
664 } | |
665 | |
/* Average the horizontally interpolated source into the destination
 * without rounding terms, 8 bytes wide, h rows:
 *   t        = (pixels[x] + pixels[x+1]) >> 1
 *   block[x] = (block[x] + t) >> 1
 *
 * block is both read and written ("+m" operand). mm7 is zeroed and used
 * to widen bytes to words. The load at byte offset 1 means 9 source
 * bytes per row are touched.
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
666 static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
667 { | |
668 UINT8 *p; | |
669 const UINT8 *pix; | |
670 p = block; | |
671 pix = pixels; | |
672 __asm __volatile( | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
673 "pxor %%mm7, %%mm7\n\t":); |
0 | 674 do { |
675 __asm __volatile( | |
676 "movq %1, %%mm0\n\t" | |
677 "movq 1%1, %%mm1\n\t" | |
678 "movq %0, %%mm4\n\t" | |
679 "movq %%mm0, %%mm2\n\t" | |
680 "movq %%mm1, %%mm3\n\t" | |
681 "movq %%mm4, %%mm5\n\t" | |
682 "punpcklbw %%mm7, %%mm0\n\t" | |
683 "punpcklbw %%mm7, %%mm1\n\t" | |
684 "punpckhbw %%mm7, %%mm2\n\t" | |
685 "punpckhbw %%mm7, %%mm3\n\t" | |
686 "punpcklbw %%mm7, %%mm4\n\t" | |
687 "punpckhbw %%mm7, %%mm5\n\t" | |
688 "paddusw %%mm1, %%mm0\n\t" | |
689 "paddusw %%mm3, %%mm2\n\t" | |
690 "psrlw $1, %%mm0\n\t" | |
691 "psrlw $1, %%mm2\n\t" | |
692 "paddusw %%mm4, %%mm0\n\t" | |
693 "paddusw %%mm5, %%mm2\n\t" | |
694 "psrlw $1, %%mm0\n\t" | |
695 "psrlw $1, %%mm2\n\t" | |
696 "packuswb %%mm2, %%mm0\n\t" | |
697 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
698 :"+m"(*p) |
0 | 699 :"m"(*pix) |
700 :"memory"); | |
701 pix += line_size; | |
702 p += line_size; | |
703 } while (--h); | |
704 } | |
705 | |
/* Average the vertically interpolated source into the destination
 * without rounding terms, 8 bytes wide, h rows:
 *   t        = (pixels[x] + pixels[x + line_size]) >> 1
 *   block[x] = (block[x] + t) >> 1
 *
 * block is both read and written ("+m" operand). mm7 is zeroed and used
 * to widen bytes to words. Each row also reads the source row below it.
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
706 static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
707 { | |
708 UINT8 *p; | |
709 const UINT8 *pix; | |
710 p = block; | |
711 pix = pixels; | |
712 __asm __volatile( | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
713 "pxor %%mm7, %%mm7\n\t":); |
0 | 714 do { |
715 __asm __volatile( | |
716 "movq %1, %%mm0\n\t" | |
717 "movq %2, %%mm1\n\t" | |
718 "movq %0, %%mm4\n\t" | |
719 "movq %%mm0, %%mm2\n\t" | |
720 "movq %%mm1, %%mm3\n\t" | |
721 "movq %%mm4, %%mm5\n\t" | |
722 "punpcklbw %%mm7, %%mm0\n\t" | |
723 "punpcklbw %%mm7, %%mm1\n\t" | |
724 "punpckhbw %%mm7, %%mm2\n\t" | |
725 "punpckhbw %%mm7, %%mm3\n\t" | |
726 "punpcklbw %%mm7, %%mm4\n\t" | |
727 "punpckhbw %%mm7, %%mm5\n\t" | |
728 "paddusw %%mm1, %%mm0\n\t" | |
729 "paddusw %%mm3, %%mm2\n\t" | |
730 "psrlw $1, %%mm0\n\t" | |
731 "psrlw $1, %%mm2\n\t" | |
732 "paddusw %%mm4, %%mm0\n\t" | |
733 "paddusw %%mm5, %%mm2\n\t" | |
734 "psrlw $1, %%mm0\n\t" | |
735 "psrlw $1, %%mm2\n\t" | |
736 "packuswb %%mm2, %%mm0\n\t" | |
737 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
738 :"+m"(*p) |
0 | 739 :"m"(*pix), "m"(*(pix+line_size)) |
740 :"memory"); | |
741 pix += line_size; | |
742 p += line_size ; | |
743 } while(--h); | |
744 } | |
745 | |
/* Average the diagonally interpolated source into the destination,
 * "no_rnd" variant, 8 bytes wide, h rows:
 *   t        = (pixels[x] + pixels[x+1]
 *               + pixels[x+line_size] + pixels[x+line_size+1] + 1) >> 2
 *   block[x] = (block[x] + t) >> 1
 *
 * mm6 holds mm_wone (bias for the 4-tap sum); the final average has no
 * bias. mm7 is zero and widens bytes to words. block is both read and
 * written ("+m" operand).
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
746 static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
747 { | |
748 UINT8 *p; | |
749 const UINT8 *pix; | |
750 p = block; | |
751 pix = pixels; | |
752 __asm __volatile( | |
753 "pxor %%mm7, %%mm7\n\t" | |
754 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
755 ::"m"(mm_wone)); |
0 | 756 do { |
757 __asm __volatile( | |
758 "movq %1, %%mm0\n\t" | |
759 "movq %2, %%mm1\n\t" | |
760 "movq 1%1, %%mm4\n\t" | |
761 "movq 1%2, %%mm5\n\t" | |
762 "movq %%mm0, %%mm2\n\t" | |
763 "movq %%mm1, %%mm3\n\t" | |
764 "punpcklbw %%mm7, %%mm0\n\t" | |
765 "punpcklbw %%mm7, %%mm1\n\t" | |
766 "punpckhbw %%mm7, %%mm2\n\t" | |
767 "punpckhbw %%mm7, %%mm3\n\t" | |
768 "paddusw %%mm1, %%mm0\n\t" | |
769 "paddusw %%mm3, %%mm2\n\t" | |
770 "movq %%mm4, %%mm1\n\t" | |
771 "movq %%mm5, %%mm3\n\t" | |
772 "punpcklbw %%mm7, %%mm4\n\t" | |
773 "punpcklbw %%mm7, %%mm5\n\t" | |
774 "punpckhbw %%mm7, %%mm1\n\t" | |
775 "punpckhbw %%mm7, %%mm3\n\t" | |
776 "paddusw %%mm5, %%mm4\n\t" | |
777 "paddusw %%mm3, %%mm1\n\t" | |
778 "paddusw %%mm6, %%mm4\n\t" | |
779 "paddusw %%mm6, %%mm1\n\t" | |
780 "paddusw %%mm4, %%mm0\n\t" | |
781 "paddusw %%mm1, %%mm2\n\t" | |
782 "movq %0, %%mm1\n\t" | |
783 "psrlw $2, %%mm0\n\t" | |
784 "movq %%mm1, %%mm3\n\t" | |
785 "psrlw $2, %%mm2\n\t" | |
786 "punpcklbw %%mm7, %%mm1\n\t" | |
787 "punpckhbw %%mm7, %%mm3\n\t" | |
788 "paddusw %%mm1, %%mm0\n\t" | |
789 "paddusw %%mm3, %%mm2\n\t" | |
790 "psrlw $1, %%mm0\n\t" | |
791 "psrlw $1, %%mm2\n\t" | |
792 "packuswb %%mm2, %%mm0\n\t" | |
793 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
794 :"+m"(*p) |
0 | 795 :"m"(*pix), |
796 "m"(*(pix+line_size)) | |
797 :"memory"); | |
798 pix += line_size; | |
799 p += line_size; | |
800 } while(--h); | |
801 } | |
802 | |
/* Subtract 8 source bytes per row from 8 block values per row, in place:
 *   block[row*8 + x] -= pixels[row*line_size + x]
 *
 * block:     read and rewritten ("+m"); advances 8 elements per row.
 * pixels:    source bytes; advances line_size bytes per row.
 * h:         number of rows.
 *
 * Pixels are zero-extended to words (mm7 = 0) and subtracted with psubsw
 * (signed saturating subtract).
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
803 static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
804 { | |
805 DCTELEM *p; | |
806 const UINT8 *pix; | |
807 p = block; | |
808 pix = pixels; | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
809 __asm __volatile("pxor %%mm7, %%mm7":); |
0 | 810 do { |
811 __asm __volatile( | |
812 "movq %0, %%mm0\n\t" | |
813 "movq %1, %%mm2\n\t" | |
814 "movq 8%0, %%mm1\n\t" | |
815 "movq %%mm2, %%mm3\n\t" | |
816 "punpcklbw %%mm7, %%mm2\n\t" | |
817 "punpckhbw %%mm7, %%mm3\n\t" | |
818 "psubsw %%mm2, %%mm0\n\t" | |
819 "psubsw %%mm3, %%mm1\n\t" | |
820 "movq %%mm0, %0\n\t" | |
821 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
822 :"+m"(*p) |
0 | 823 :"m"(*pix) |
824 :"memory"); | |
825 pix += line_size; | |
826 p += 8; | |
827 } while (--h); | |
828 } | |
829 | |
/* Subtract the horizontally interpolated source from the block values,
 * in place, 8 values per row for h rows:
 *   t                = (pixels[x] + pixels[x+1] + 1) >> 1
 *   block[row*8 + x] -= t
 *
 * The rounding word comes from mm_wone (held in mm6); mm7 is zero and
 * widens bytes to words. The subtraction is psubsw (signed saturating).
 * The load at byte offset 1 means 9 source bytes per row are touched.
 * The do/while runs the body before testing --h, so h is assumed to be
 * at least 1 -- confirm with callers. */
830 static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
831 { | |
832 DCTELEM *p; | |
833 const UINT8 *pix; | |
834 p = block; | |
835 pix = pixels; | |
836 __asm __volatile( | |
837 "pxor %%mm7, %%mm7\n\t" | |
838 "movq %0, %%mm6" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
839 ::"m"(mm_wone)); |
0 | 840 do { |
841 __asm __volatile( | |
842 "movq %0, %%mm0\n\t" | |
843 "movq %1, %%mm2\n\t" | |
844 "movq 8%0, %%mm1\n\t" | |
845 "movq 1%1, %%mm4\n\t" | |
846 "movq %%mm2, %%mm3\n\t" | |
847 "movq %%mm4, %%mm5\n\t" | |
848 "punpcklbw %%mm7, %%mm2\n\t" | |
849 "punpckhbw %%mm7, %%mm3\n\t" | |
850 "punpcklbw %%mm7, %%mm4\n\t" | |
851 "punpckhbw %%mm7, %%mm5\n\t" | |
852 "paddusw %%mm4, %%mm2\n\t" | |
853 "paddusw %%mm5, %%mm3\n\t" | |
854 "paddusw %%mm6, %%mm2\n\t" | |
855 "paddusw %%mm6, %%mm3\n\t" | |
856 "psrlw $1, %%mm2\n\t" | |
857 "psrlw $1, %%mm3\n\t" | |
858 "psubsw %%mm2, %%mm0\n\t" | |
859 "psubsw %%mm3, %%mm1\n\t" | |
860 "movq %%mm0, %0\n\t" | |
861 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
862 :"+m"(*p) |
0 | 863 :"m"(*pix) |
864 :"memory"); | |
865 pix += line_size; | |
866 p += 8; | |
867 } while (--h); | |
868 } | |
869 | |
870 static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
871 { | |
872 DCTELEM *p; | |
873 const UINT8 *pix; | |
874 p = block; | |
875 pix = pixels; | |
876 __asm __volatile( | |
877 "pxor %%mm7, %%mm7\n\t" | |
878 "movq %0, %%mm6" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
879 ::"m"(mm_wone)); |
0 | 880 do { |
881 __asm __volatile( | |
882 "movq %0, %%mm0\n\t" | |
883 "movq %1, %%mm2\n\t" | |
884 "movq 8%0, %%mm1\n\t" | |
885 "movq %2, %%mm4\n\t" | |
886 "movq %%mm2, %%mm3\n\t" | |
887 "movq %%mm4, %%mm5\n\t" | |
888 "punpcklbw %%mm7, %%mm2\n\t" | |
889 "punpckhbw %%mm7, %%mm3\n\t" | |
890 "punpcklbw %%mm7, %%mm4\n\t" | |
891 "punpckhbw %%mm7, %%mm5\n\t" | |
892 "paddusw %%mm4, %%mm2\n\t" | |
893 "paddusw %%mm5, %%mm3\n\t" | |
894 "paddusw %%mm6, %%mm2\n\t" | |
895 "paddusw %%mm6, %%mm3\n\t" | |
896 "psrlw $1, %%mm2\n\t" | |
897 "psrlw $1, %%mm3\n\t" | |
898 "psubsw %%mm2, %%mm0\n\t" | |
899 "psubsw %%mm3, %%mm1\n\t" | |
900 "movq %%mm0, %0\n\t" | |
901 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
902 :"+m"(*p) |
0 | 903 :"m"(*pix), "m"(*(pix+line_size)) |
904 :"memory"); | |
905 pix += line_size; | |
906 p += 8; | |
907 } while (--h); | |
908 } | |
909 | |
910 static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
911 { | |
912 DCTELEM *p; | |
913 const UINT8 *pix; | |
914 p = block; | |
915 pix = pixels; | |
916 __asm __volatile( | |
917 "pxor %%mm7, %%mm7\n\t" | |
918 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
919 ::"m"(mm_wtwo)); |
0 | 920 do { |
921 __asm __volatile( | |
922 "movq %1, %%mm0\n\t" | |
923 "movq %2, %%mm1\n\t" | |
924 "movq 1%1, %%mm4\n\t" | |
925 "movq 1%2, %%mm5\n\t" | |
926 "movq %%mm0, %%mm2\n\t" | |
927 "movq %%mm1, %%mm3\n\t" | |
928 "punpcklbw %%mm7, %%mm0\n\t" | |
929 "punpcklbw %%mm7, %%mm1\n\t" | |
930 "punpckhbw %%mm7, %%mm2\n\t" | |
931 "punpckhbw %%mm7, %%mm3\n\t" | |
932 "paddusw %%mm1, %%mm0\n\t" | |
933 "paddusw %%mm3, %%mm2\n\t" | |
934 "movq %%mm4, %%mm1\n\t" | |
935 "movq %%mm5, %%mm3\n\t" | |
936 "punpcklbw %%mm7, %%mm4\n\t" | |
937 "punpcklbw %%mm7, %%mm5\n\t" | |
938 "punpckhbw %%mm7, %%mm1\n\t" | |
939 "punpckhbw %%mm7, %%mm3\n\t" | |
940 "paddusw %%mm5, %%mm4\n\t" | |
941 "paddusw %%mm3, %%mm1\n\t" | |
942 "paddusw %%mm6, %%mm4\n\t" | |
943 "paddusw %%mm6, %%mm1\n\t" | |
944 "paddusw %%mm4, %%mm0\n\t" | |
945 "paddusw %%mm1, %%mm2\n\t" | |
946 "movq %0, %%mm1\n\t" | |
947 "movq 8%0, %%mm3\n\t" | |
948 "psrlw $2, %%mm0\n\t" | |
949 "psrlw $2, %%mm2\n\t" | |
950 "psubsw %%mm0, %%mm1\n\t" | |
951 "psubsw %%mm2, %%mm3\n\t" | |
952 "movq %%mm1, %0\n\t" | |
953 "movq %%mm3, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
954 :"+m"(*p) |
0 | 955 :"m"(*pix), |
956 "m"(*(pix+line_size)) | |
957 :"memory"); | |
958 pix += line_size; | |
959 p += 8 ; | |
960 } while(--h); | |
961 } | |
962 | |
963 void dsputil_init_mmx(void) | |
964 { | |
965 mm_flags = mm_support(); | |
188 | 966 #if 1 |
967 printf("libavcodec: CPU flags:"); | |
0 | 968 if (mm_flags & MM_MMX) |
969 printf(" mmx"); | |
970 if (mm_flags & MM_MMXEXT) | |
971 printf(" mmxext"); | |
972 if (mm_flags & MM_3DNOW) | |
973 printf(" 3dnow"); | |
974 if (mm_flags & MM_SSE) | |
975 printf(" sse"); | |
976 if (mm_flags & MM_SSE2) | |
977 printf(" sse2"); | |
978 printf("\n"); | |
979 #endif | |
980 | |
981 if (mm_flags & MM_MMX) { | |
982 get_pixels = get_pixels_mmx; | |
983 put_pixels_clamped = put_pixels_clamped_mmx; | |
984 add_pixels_clamped = add_pixels_clamped_mmx; | |
985 | |
986 pix_abs16x16 = pix_abs16x16_mmx; | |
987 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
988 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
989 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; | |
990 av_fdct = fdct_mmx; | |
991 | |
992 put_pixels_tab[0] = put_pixels_mmx; | |
993 put_pixels_tab[1] = put_pixels_x2_mmx; | |
994 put_pixels_tab[2] = put_pixels_y2_mmx; | |
995 put_pixels_tab[3] = put_pixels_xy2_mmx; | |
996 | |
997 put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
998 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
999 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1000 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
1001 | |
1002 avg_pixels_tab[0] = avg_pixels_mmx; | |
1003 avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
1004 avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
1005 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1006 | |
1007 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
1008 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
1009 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
1010 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
1011 | |
1012 sub_pixels_tab[0] = sub_pixels_mmx; | |
1013 sub_pixels_tab[1] = sub_pixels_x2_mmx; | |
1014 sub_pixels_tab[2] = sub_pixels_y2_mmx; | |
1015 sub_pixels_tab[3] = sub_pixels_xy2_mmx; | |
1016 | |
1017 if (mm_flags & MM_MMXEXT) { | |
1018 pix_abs16x16 = pix_abs16x16_sse; | |
1019 } | |
1020 | |
1021 if (mm_flags & MM_SSE) { | |
1022 put_pixels_tab[1] = put_pixels_x2_sse; | |
1023 put_pixels_tab[2] = put_pixels_y2_sse; | |
1024 | |
1025 avg_pixels_tab[0] = avg_pixels_sse; | |
1026 avg_pixels_tab[1] = avg_pixels_x2_sse; | |
1027 avg_pixels_tab[2] = avg_pixels_y2_sse; | |
1028 avg_pixels_tab[3] = avg_pixels_xy2_sse; | |
1029 | |
1030 sub_pixels_tab[1] = sub_pixels_x2_sse; | |
1031 sub_pixels_tab[2] = sub_pixels_y2_sse; | |
1032 } else if (mm_flags & MM_3DNOW) { | |
1033 put_pixels_tab[1] = put_pixels_x2_3dnow; | |
1034 put_pixels_tab[2] = put_pixels_y2_3dnow; | |
1035 | |
1036 avg_pixels_tab[0] = avg_pixels_3dnow; | |
1037 avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
1038 avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
1039 avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
1040 | |
1041 sub_pixels_tab[1] = sub_pixels_x2_3dnow; | |
1042 sub_pixels_tab[2] = sub_pixels_y2_3dnow; | |
1043 } | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1044 |
42 | 1045 /* idct */ |
1046 if (mm_flags & MM_MMXEXT) { | |
1047 ff_idct = ff_mmxext_idct; | |
1048 } else { | |
1049 ff_idct = ff_mmx_idct; | |
1050 } | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1051 #ifdef SIMPLE_IDCT |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1052 // ff_idct = simple_idct; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1053 ff_idct = simple_idct_mmx; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1054 #endif |
0 | 1055 } |
1056 } |