Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 160:1bf8c111691d libavcodec
- Bug fixed on H.263 decoder initialization.
author | pulento |
---|---|
date | Sat, 17 Nov 2001 15:43:04 +0000 |
parents | ae0516eadae2 |
children | ac5075a55488 |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
3 * Copyright (c) 2000, 2001 Gerard Lantau. | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 * | |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 #include "../dsputil.h" | |
23 | |
5 | 24 int mm_flags; /* multimedia extension flags */ |
25 | |
0 | 26 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); |
27 int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
28 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
29 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
30 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
31 | |
42 | 32 /* external functions, from idct_mmx.c */ |
33 void ff_mmx_idct(DCTELEM *block); | |
34 void ff_mmxext_idct(DCTELEM *block); | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
35 |
0 | 36 /* pixel operations */ |
/*
 * Packed-word constants used as rounding terms by the MMX routines.
 * Each 64-bit value holds four 16-bit lanes that are all 1 (mm_wone) or
 * all 2 (mm_wtwo), i.e. the same bit pattern as
 * unsigned short[4] = { 1, 1, 1, 1 } and { 2, 2, 2, 2 }.
 * The ULL suffix gives each literal an explicit 64-bit unsigned type
 * instead of relying on how the compiler types an oversized hex constant.
 */
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
0 | 41 |
42 /***********************************/ | |
43 /* 3Dnow specific */ | |
44 | |
45 #define DEF(x) x ## _3dnow | |
46 /* for Athlons PAVGUSB is preferred */ | |
47 #define PAVGB "pavgusb" | |
48 | |
49 #include "dsputil_mmx_avg.h" | |
50 | |
51 #undef DEF | |
52 #undef PAVGB | |
53 | |
54 /***********************************/ | |
55 /* MMX2 specific */ | |
56 | |
57 #define DEF(x) x ## _sse | |
58 | |
59 /* Introduced only in MMX2 set */ | |
60 #define PAVGB "pavgb" | |
61 | |
62 #include "dsputil_mmx_avg.h" | |
63 | |
64 #undef DEF | |
65 #undef PAVGB | |
66 | |
67 /***********************************/ | |
68 /* standard MMX */ | |
69 | |
70 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
71 { | |
72 DCTELEM *p; | |
73 const UINT8 *pix; | |
74 int i; | |
75 | |
76 /* read the pixels */ | |
77 p = block; | |
78 pix = pixels; | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
79 __asm __volatile("pxor %%mm7, %%mm7":); |
0 | 80 for(i=0;i<4;i++) { |
81 __asm __volatile( | |
82 "movq %1, %%mm0\n\t" | |
83 "movq %2, %%mm1\n\t" | |
84 "movq %%mm0, %%mm2\n\t" | |
85 "movq %%mm1, %%mm3\n\t" | |
86 "punpcklbw %%mm7, %%mm0\n\t" | |
87 "punpckhbw %%mm7, %%mm2\n\t" | |
88 "punpcklbw %%mm7, %%mm1\n\t" | |
89 "punpckhbw %%mm7, %%mm3\n\t" | |
90 "movq %%mm0, %0\n\t" | |
91 "movq %%mm2, 8%0\n\t" | |
92 "movq %%mm1, 16%0\n\t" | |
93 "movq %%mm3, 24%0\n\t" | |
94 :"=m"(*p) | |
95 :"m"(*pix), "m"(*(pix+line_size)) | |
96 :"memory"); | |
97 pix += line_size*2; | |
98 p += 16; | |
99 } | |
100 } | |
101 | |
102 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
103 { | |
104 const DCTELEM *p; | |
105 UINT8 *pix; | |
106 int i; | |
107 | |
108 /* read the pixels */ | |
109 p = block; | |
110 pix = pixels; | |
111 for(i=0;i<2;i++) { | |
112 __asm __volatile( | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
113 "movq %3, %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
114 "movq 8%3, %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
115 "movq 16%3, %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
116 "movq 24%3, %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
117 "movq 32%3, %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
118 "movq 40%3, %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
119 "movq 48%3, %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
120 "movq 56%3, %%mm7\n\t" |
0 | 121 "packuswb %%mm1, %%mm0\n\t" |
122 "packuswb %%mm3, %%mm2\n\t" | |
123 "packuswb %%mm5, %%mm4\n\t" | |
124 "packuswb %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
125 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
126 "movq %%mm2, (%0, %1)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
127 "movq %%mm4, (%0, %1, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
128 "movq %%mm6, (%0, %2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
129 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) |
0 | 130 :"memory"); |
131 pix += line_size*4; | |
132 p += 32; | |
133 } | |
134 } | |
135 | |
136 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
137 { | |
138 const DCTELEM *p; | |
139 UINT8 *pix; | |
140 int i; | |
141 | |
142 /* read the pixels */ | |
143 p = block; | |
144 pix = pixels; | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
145 __asm __volatile("pxor %%mm7, %%mm7":); |
0 | 146 for(i=0;i<4;i++) { |
147 __asm __volatile( | |
148 "movq %2, %%mm0\n\t" | |
149 "movq 8%2, %%mm1\n\t" | |
150 "movq 16%2, %%mm2\n\t" | |
151 "movq 24%2, %%mm3\n\t" | |
152 "movq %0, %%mm4\n\t" | |
153 "movq %1, %%mm6\n\t" | |
154 "movq %%mm4, %%mm5\n\t" | |
155 "punpcklbw %%mm7, %%mm4\n\t" | |
156 "punpckhbw %%mm7, %%mm5\n\t" | |
157 "paddsw %%mm4, %%mm0\n\t" | |
158 "paddsw %%mm5, %%mm1\n\t" | |
159 "movq %%mm6, %%mm5\n\t" | |
160 "punpcklbw %%mm7, %%mm6\n\t" | |
161 "punpckhbw %%mm7, %%mm5\n\t" | |
162 "paddsw %%mm6, %%mm2\n\t" | |
163 "paddsw %%mm5, %%mm3\n\t" | |
164 "packuswb %%mm1, %%mm0\n\t" | |
165 "packuswb %%mm3, %%mm2\n\t" | |
166 "movq %%mm0, %0\n\t" | |
167 "movq %%mm2, %1\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
168 :"+m"(*pix), "+m"(*(pix+line_size)) |
0 | 169 :"m"(*p) |
170 :"memory"); | |
171 pix += line_size*2; | |
172 p += 16; | |
173 } | |
174 } | |
175 | |
176 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
177 { | |
178 int dh, hh; | |
179 UINT8 *p; | |
180 const UINT8 *pix; | |
181 p = block; | |
182 pix = pixels; | |
183 hh=h>>2; | |
184 dh=h&3; | |
185 while(hh--) { | |
186 __asm __volatile( | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
187 "movq (%1), %%mm0 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
188 "movq (%1, %2), %%mm1 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
189 "movq (%1, %2, 2), %%mm2 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
190 "movq (%1, %3), %%mm3 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
191 "movq %%mm0, (%0) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
192 "movq %%mm1, (%0, %2) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
193 "movq %%mm2, (%0, %2, 2) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
194 "movq %%mm3, (%0, %3) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
195 ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3) |
0 | 196 :"memory"); |
197 pix = pix + line_size*4; | |
198 p = p + line_size*4; | |
199 } | |
200 while(dh--) { | |
201 __asm __volatile( | |
202 "movq %1, %%mm0\n\t" | |
203 "movq %%mm0, %0\n\t" | |
204 :"=m"(*p) | |
205 :"m"(*pix) | |
206 :"memory"); | |
207 pix = pix + line_size; | |
208 p = p + line_size; | |
209 } | |
210 } | |
211 | |
212 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
213 { | |
214 UINT8 *p; | |
215 const UINT8 *pix; | |
216 p = block; | |
217 pix = pixels; | |
218 __asm __volatile( | |
219 "pxor %%mm7, %%mm7\n\t" | |
220 "movq %0, %%mm4\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
221 ::"m"(mm_wone)); |
0 | 222 do { |
223 __asm __volatile( | |
224 "movq %1, %%mm0\n\t" | |
225 "movq 1%1, %%mm1\n\t" | |
226 "movq %%mm0, %%mm2\n\t" | |
227 "movq %%mm1, %%mm3\n\t" | |
228 "punpcklbw %%mm7, %%mm0\n\t" | |
229 "punpcklbw %%mm7, %%mm1\n\t" | |
230 "punpckhbw %%mm7, %%mm2\n\t" | |
231 "punpckhbw %%mm7, %%mm3\n\t" | |
232 "paddusw %%mm1, %%mm0\n\t" | |
233 "paddusw %%mm3, %%mm2\n\t" | |
234 "paddusw %%mm4, %%mm0\n\t" | |
235 "paddusw %%mm4, %%mm2\n\t" | |
236 "psrlw $1, %%mm0\n\t" | |
237 "psrlw $1, %%mm2\n\t" | |
238 "packuswb %%mm2, %%mm0\n\t" | |
239 "movq %%mm0, %0\n\t" | |
240 :"=m"(*p) | |
241 :"m"(*pix) | |
242 :"memory"); | |
243 pix += line_size; p += line_size; | |
244 } while (--h); | |
245 } | |
246 | |
247 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
248 { | |
249 UINT8 *p; | |
250 const UINT8 *pix; | |
251 p = block; | |
252 pix = pixels; | |
253 __asm __volatile( | |
254 "pxor %%mm7, %%mm7\n\t" | |
255 "movq %0, %%mm4\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
256 ::"m"(mm_wone)); |
0 | 257 do { |
258 __asm __volatile( | |
259 "movq %1, %%mm0\n\t" | |
260 "movq %2, %%mm1\n\t" | |
261 "movq %%mm0, %%mm2\n\t" | |
262 "movq %%mm1, %%mm3\n\t" | |
263 "punpcklbw %%mm7, %%mm0\n\t" | |
264 "punpcklbw %%mm7, %%mm1\n\t" | |
265 "punpckhbw %%mm7, %%mm2\n\t" | |
266 "punpckhbw %%mm7, %%mm3\n\t" | |
267 "paddusw %%mm1, %%mm0\n\t" | |
268 "paddusw %%mm3, %%mm2\n\t" | |
269 "paddusw %%mm4, %%mm0\n\t" | |
270 "paddusw %%mm4, %%mm2\n\t" | |
271 "psrlw $1, %%mm0\n\t" | |
272 "psrlw $1, %%mm2\n\t" | |
273 "packuswb %%mm2, %%mm0\n\t" | |
274 "movq %%mm0, %0\n\t" | |
275 :"=m"(*p) | |
276 :"m"(*pix), | |
277 "m"(*(pix+line_size)) | |
278 :"memory"); | |
279 pix += line_size; | |
280 p += line_size; | |
281 } while (--h); | |
282 } | |
283 | |
284 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
285 { | |
286 UINT8 *p; | |
287 const UINT8 *pix; | |
288 p = block; | |
289 pix = pixels; | |
290 __asm __volatile( | |
291 "pxor %%mm7, %%mm7\n\t" | |
292 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
293 ::"m"(mm_wtwo)); |
0 | 294 do { |
295 __asm __volatile( | |
296 "movq %1, %%mm0\n\t" | |
297 "movq %2, %%mm1\n\t" | |
298 "movq 1%1, %%mm4\n\t" | |
299 "movq 1%2, %%mm5\n\t" | |
300 "movq %%mm0, %%mm2\n\t" | |
301 "movq %%mm1, %%mm3\n\t" | |
302 "punpcklbw %%mm7, %%mm0\n\t" | |
303 "punpcklbw %%mm7, %%mm1\n\t" | |
304 "punpckhbw %%mm7, %%mm2\n\t" | |
305 "punpckhbw %%mm7, %%mm3\n\t" | |
306 "paddusw %%mm1, %%mm0\n\t" | |
307 "paddusw %%mm3, %%mm2\n\t" | |
308 "movq %%mm4, %%mm1\n\t" | |
309 "movq %%mm5, %%mm3\n\t" | |
310 "punpcklbw %%mm7, %%mm4\n\t" | |
311 "punpcklbw %%mm7, %%mm5\n\t" | |
312 "punpckhbw %%mm7, %%mm1\n\t" | |
313 "punpckhbw %%mm7, %%mm3\n\t" | |
314 "paddusw %%mm5, %%mm4\n\t" | |
315 "paddusw %%mm3, %%mm1\n\t" | |
316 "paddusw %%mm6, %%mm4\n\t" | |
317 "paddusw %%mm6, %%mm1\n\t" | |
318 "paddusw %%mm4, %%mm0\n\t" | |
319 "paddusw %%mm1, %%mm2\n\t" | |
320 "psrlw $2, %%mm0\n\t" | |
321 "psrlw $2, %%mm2\n\t" | |
322 "packuswb %%mm2, %%mm0\n\t" | |
323 "movq %%mm0, %0\n\t" | |
324 :"=m"(*p) | |
325 :"m"(*pix), | |
326 "m"(*(pix+line_size)) | |
327 :"memory"); | |
328 pix += line_size; | |
329 p += line_size; | |
330 } while(--h); | |
331 } | |
332 | |
333 static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
334 { | |
335 UINT8 *p; | |
336 const UINT8 *pix; | |
337 p = block; | |
338 pix = pixels; | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
339 __asm __volatile("pxor %%mm7, %%mm7\n\t":); |
0 | 340 do { |
341 __asm __volatile( | |
342 "movq %1, %%mm0\n\t" | |
343 "movq 1%1, %%mm1\n\t" | |
344 "movq %%mm0, %%mm2\n\t" | |
345 "movq %%mm1, %%mm3\n\t" | |
346 "punpcklbw %%mm7, %%mm0\n\t" | |
347 "punpcklbw %%mm7, %%mm1\n\t" | |
348 "punpckhbw %%mm7, %%mm2\n\t" | |
349 "punpckhbw %%mm7, %%mm3\n\t" | |
350 "paddusw %%mm1, %%mm0\n\t" | |
351 "paddusw %%mm3, %%mm2\n\t" | |
352 "psrlw $1, %%mm0\n\t" | |
353 "psrlw $1, %%mm2\n\t" | |
354 "packuswb %%mm2, %%mm0\n\t" | |
355 "movq %%mm0, %0\n\t" | |
356 :"=m"(*p) | |
357 :"m"(*pix) | |
358 :"memory"); | |
359 pix += line_size; | |
360 p += line_size; | |
361 } while (--h); | |
362 } | |
363 | |
364 static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
365 { | |
366 UINT8 *p; | |
367 const UINT8 *pix; | |
368 p = block; | |
369 pix = pixels; | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
370 __asm __volatile("pxor %%mm7, %%mm7\n\t":); |
0 | 371 do { |
372 __asm __volatile( | |
373 "movq %1, %%mm0\n\t" | |
374 "movq %2, %%mm1\n\t" | |
375 "movq %%mm0, %%mm2\n\t" | |
376 "movq %%mm1, %%mm3\n\t" | |
377 "punpcklbw %%mm7, %%mm0\n\t" | |
378 "punpcklbw %%mm7, %%mm1\n\t" | |
379 "punpckhbw %%mm7, %%mm2\n\t" | |
380 "punpckhbw %%mm7, %%mm3\n\t" | |
381 "paddusw %%mm1, %%mm0\n\t" | |
382 "paddusw %%mm3, %%mm2\n\t" | |
383 "psrlw $1, %%mm0\n\t" | |
384 "psrlw $1, %%mm2\n\t" | |
385 "packuswb %%mm2, %%mm0\n\t" | |
386 "movq %%mm0, %0\n\t" | |
387 :"=m"(*p) | |
388 :"m"(*pix), | |
389 "m"(*(pix+line_size)) | |
390 :"memory"); | |
391 pix += line_size; | |
392 p += line_size; | |
393 } while(--h); | |
394 } | |
395 | |
396 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
397 { | |
398 UINT8 *p; | |
399 const UINT8 *pix; | |
400 p = block; | |
401 pix = pixels; | |
402 __asm __volatile( | |
403 "pxor %%mm7, %%mm7\n\t" | |
404 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
405 ::"m"(mm_wone)); |
0 | 406 do { |
407 __asm __volatile( | |
408 "movq %1, %%mm0\n\t" | |
409 "movq %2, %%mm1\n\t" | |
410 "movq 1%1, %%mm4\n\t" | |
411 "movq 1%2, %%mm5\n\t" | |
412 "movq %%mm0, %%mm2\n\t" | |
413 "movq %%mm1, %%mm3\n\t" | |
414 "punpcklbw %%mm7, %%mm0\n\t" | |
415 "punpcklbw %%mm7, %%mm1\n\t" | |
416 "punpckhbw %%mm7, %%mm2\n\t" | |
417 "punpckhbw %%mm7, %%mm3\n\t" | |
418 "paddusw %%mm1, %%mm0\n\t" | |
419 "paddusw %%mm3, %%mm2\n\t" | |
420 "movq %%mm4, %%mm1\n\t" | |
421 "movq %%mm5, %%mm3\n\t" | |
422 "punpcklbw %%mm7, %%mm4\n\t" | |
423 "punpcklbw %%mm7, %%mm5\n\t" | |
424 "punpckhbw %%mm7, %%mm1\n\t" | |
425 "punpckhbw %%mm7, %%mm3\n\t" | |
426 "paddusw %%mm5, %%mm4\n\t" | |
427 "paddusw %%mm3, %%mm1\n\t" | |
428 "paddusw %%mm6, %%mm4\n\t" | |
429 "paddusw %%mm6, %%mm1\n\t" | |
430 "paddusw %%mm4, %%mm0\n\t" | |
431 "paddusw %%mm1, %%mm2\n\t" | |
432 "psrlw $2, %%mm0\n\t" | |
433 "psrlw $2, %%mm2\n\t" | |
434 "packuswb %%mm2, %%mm0\n\t" | |
435 "movq %%mm0, %0\n\t" | |
436 :"=m"(*p) | |
437 :"m"(*pix), | |
438 "m"(*(pix+line_size)) | |
439 :"memory"); | |
440 pix += line_size; | |
441 p += line_size; | |
442 } while(--h); | |
443 } | |
444 | |
445 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
446 { | |
447 UINT8 *p; | |
448 const UINT8 *pix; | |
449 p = block; | |
450 pix = pixels; | |
451 __asm __volatile( | |
452 "pxor %%mm7, %%mm7\n\t" | |
453 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
454 ::"m"(mm_wone)); |
0 | 455 do { |
456 __asm __volatile( | |
457 "movq %0, %%mm0\n\t" | |
458 "movq %1, %%mm1\n\t" | |
459 "movq %%mm0, %%mm2\n\t" | |
460 "movq %%mm1, %%mm3\n\t" | |
461 "punpcklbw %%mm7, %%mm0\n\t" | |
462 "punpcklbw %%mm7, %%mm1\n\t" | |
463 "punpckhbw %%mm7, %%mm2\n\t" | |
464 "punpckhbw %%mm7, %%mm3\n\t" | |
465 "paddusw %%mm1, %%mm0\n\t" | |
466 "paddusw %%mm3, %%mm2\n\t" | |
467 "paddusw %%mm6, %%mm0\n\t" | |
468 "paddusw %%mm6, %%mm2\n\t" | |
469 "psrlw $1, %%mm0\n\t" | |
470 "psrlw $1, %%mm2\n\t" | |
471 "packuswb %%mm2, %%mm0\n\t" | |
472 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
473 :"+m"(*p) |
0 | 474 :"m"(*pix) |
475 :"memory"); | |
476 pix += line_size; | |
477 p += line_size; | |
478 } | |
479 while (--h); | |
480 } | |
481 | |
482 static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
483 { | |
484 UINT8 *p; | |
485 const UINT8 *pix; | |
486 p = block; | |
487 pix = pixels; | |
488 __asm __volatile( | |
489 "pxor %%mm7, %%mm7\n\t" | |
490 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
491 ::"m"(mm_wone)); |
0 | 492 do { |
493 __asm __volatile( | |
494 "movq %1, %%mm1\n\t" | |
495 "movq %0, %%mm0\n\t" | |
496 "movq 1%1, %%mm4\n\t" | |
497 "movq %%mm0, %%mm2\n\t" | |
498 "movq %%mm1, %%mm3\n\t" | |
499 "movq %%mm4, %%mm5\n\t" | |
500 "punpcklbw %%mm7, %%mm1\n\t" | |
501 "punpckhbw %%mm7, %%mm3\n\t" | |
502 "punpcklbw %%mm7, %%mm4\n\t" | |
503 "punpckhbw %%mm7, %%mm5\n\t" | |
504 "punpcklbw %%mm7, %%mm0\n\t" | |
505 "punpckhbw %%mm7, %%mm2\n\t" | |
506 "paddusw %%mm4, %%mm1\n\t" | |
507 "paddusw %%mm5, %%mm3\n\t" | |
508 "paddusw %%mm6, %%mm1\n\t" | |
509 "paddusw %%mm6, %%mm3\n\t" | |
510 "psrlw $1, %%mm1\n\t" | |
511 "psrlw $1, %%mm3\n\t" | |
512 "paddusw %%mm6, %%mm0\n\t" | |
513 "paddusw %%mm6, %%mm2\n\t" | |
514 "paddusw %%mm1, %%mm0\n\t" | |
515 "paddusw %%mm3, %%mm2\n\t" | |
516 "psrlw $1, %%mm0\n\t" | |
517 "psrlw $1, %%mm2\n\t" | |
518 "packuswb %%mm2, %%mm0\n\t" | |
519 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
520 :"+m"(*p) |
0 | 521 :"m"(*pix) |
522 :"memory"); | |
523 pix += line_size; | |
524 p += line_size; | |
525 } while (--h); | |
526 } | |
527 | |
528 static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
529 { | |
530 UINT8 *p; | |
531 const UINT8 *pix; | |
532 p = block; | |
533 pix = pixels; | |
534 __asm __volatile( | |
535 "pxor %%mm7, %%mm7\n\t" | |
536 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
537 ::"m"(mm_wone)); |
0 | 538 do { |
539 __asm __volatile( | |
540 "movq %1, %%mm1\n\t" | |
541 "movq %0, %%mm0\n\t" | |
542 "movq %2, %%mm4\n\t" | |
543 "movq %%mm0, %%mm2\n\t" | |
544 "movq %%mm1, %%mm3\n\t" | |
545 "movq %%mm4, %%mm5\n\t" | |
546 "punpcklbw %%mm7, %%mm1\n\t" | |
547 "punpckhbw %%mm7, %%mm3\n\t" | |
548 "punpcklbw %%mm7, %%mm4\n\t" | |
549 "punpckhbw %%mm7, %%mm5\n\t" | |
550 "punpcklbw %%mm7, %%mm0\n\t" | |
551 "punpckhbw %%mm7, %%mm2\n\t" | |
552 "paddusw %%mm4, %%mm1\n\t" | |
553 "paddusw %%mm5, %%mm3\n\t" | |
554 "paddusw %%mm6, %%mm1\n\t" | |
555 "paddusw %%mm6, %%mm3\n\t" | |
556 "psrlw $1, %%mm1\n\t" | |
557 "psrlw $1, %%mm3\n\t" | |
558 "paddusw %%mm6, %%mm0\n\t" | |
559 "paddusw %%mm6, %%mm2\n\t" | |
560 "paddusw %%mm1, %%mm0\n\t" | |
561 "paddusw %%mm3, %%mm2\n\t" | |
562 "psrlw $1, %%mm0\n\t" | |
563 "psrlw $1, %%mm2\n\t" | |
564 "packuswb %%mm2, %%mm0\n\t" | |
565 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
566 :"+m"(*p) |
0 | 567 :"m"(*pix), "m"(*(pix+line_size)) |
568 :"memory"); | |
569 pix += line_size; | |
570 p += line_size ; | |
571 } while(--h); | |
572 } | |
573 | |
574 static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
575 { | |
576 UINT8 *p; | |
577 const UINT8 *pix; | |
578 p = block; | |
579 pix = pixels; | |
580 __asm __volatile( | |
581 "pxor %%mm7, %%mm7\n\t" | |
582 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
583 ::"m"(mm_wtwo)); |
0 | 584 do { |
585 __asm __volatile( | |
586 "movq %1, %%mm0\n\t" | |
587 "movq %2, %%mm1\n\t" | |
588 "movq 1%1, %%mm4\n\t" | |
589 "movq 1%2, %%mm5\n\t" | |
590 "movq %%mm0, %%mm2\n\t" | |
591 "movq %%mm1, %%mm3\n\t" | |
592 "punpcklbw %%mm7, %%mm0\n\t" | |
593 "punpcklbw %%mm7, %%mm1\n\t" | |
594 "punpckhbw %%mm7, %%mm2\n\t" | |
595 "punpckhbw %%mm7, %%mm3\n\t" | |
596 "paddusw %%mm1, %%mm0\n\t" | |
597 "paddusw %%mm3, %%mm2\n\t" | |
598 "movq %%mm4, %%mm1\n\t" | |
599 "movq %%mm5, %%mm3\n\t" | |
600 "punpcklbw %%mm7, %%mm4\n\t" | |
601 "punpcklbw %%mm7, %%mm5\n\t" | |
602 "punpckhbw %%mm7, %%mm1\n\t" | |
603 "punpckhbw %%mm7, %%mm3\n\t" | |
604 "paddusw %%mm5, %%mm4\n\t" | |
605 "paddusw %%mm3, %%mm1\n\t" | |
606 "paddusw %%mm6, %%mm4\n\t" | |
607 "paddusw %%mm6, %%mm1\n\t" | |
608 "paddusw %%mm4, %%mm0\n\t" | |
609 "paddusw %%mm1, %%mm2\n\t" | |
610 "movq %3, %%mm5\n\t" | |
611 "psrlw $2, %%mm0\n\t" | |
612 "movq %0, %%mm1\n\t" | |
613 "psrlw $2, %%mm2\n\t" | |
614 "movq %%mm1, %%mm3\n\t" | |
615 "punpcklbw %%mm7, %%mm1\n\t" | |
616 "punpckhbw %%mm7, %%mm3\n\t" | |
617 "paddusw %%mm1, %%mm0\n\t" | |
618 "paddusw %%mm3, %%mm2\n\t" | |
619 "paddusw %%mm5, %%mm0\n\t" | |
620 "paddusw %%mm5, %%mm2\n\t" | |
621 "psrlw $1, %%mm0\n\t" | |
622 "psrlw $1, %%mm2\n\t" | |
623 "packuswb %%mm2, %%mm0\n\t" | |
624 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
625 :"+m"(*p) |
0 | 626 :"m"(*pix), |
8 | 627 "m"(*(pix+line_size)), "m"(mm_wone) |
0 | 628 :"memory"); |
629 pix += line_size; | |
630 p += line_size ; | |
631 } while(--h); | |
632 } | |
633 | |
634 static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
635 { | |
636 UINT8 *p; | |
637 const UINT8 *pix; | |
638 p = block; | |
639 pix = pixels; | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
640 __asm __volatile("pxor %%mm7, %%mm7\n\t":); |
0 | 641 do { |
642 __asm __volatile( | |
643 "movq %1, %%mm0\n\t" | |
644 "movq %0, %%mm1\n\t" | |
645 "movq %%mm0, %%mm2\n\t" | |
646 "movq %%mm1, %%mm3\n\t" | |
647 "punpcklbw %%mm7, %%mm0\n\t" | |
648 "punpcklbw %%mm7, %%mm1\n\t" | |
649 "punpckhbw %%mm7, %%mm2\n\t" | |
650 "punpckhbw %%mm7, %%mm3\n\t" | |
651 "paddusw %%mm1, %%mm0\n\t" | |
652 "paddusw %%mm3, %%mm2\n\t" | |
653 "psrlw $1, %%mm0\n\t" | |
654 "psrlw $1, %%mm2\n\t" | |
655 "packuswb %%mm2, %%mm0\n\t" | |
656 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
657 :"+m"(*p) |
0 | 658 :"m"(*pix) |
659 :"memory"); | |
660 pix += line_size; | |
661 p += line_size ; | |
662 } while (--h); | |
663 } | |
664 | |
665 static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
666 { | |
667 UINT8 *p; | |
668 const UINT8 *pix; | |
669 p = block; | |
670 pix = pixels; | |
671 __asm __volatile( | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
672 "pxor %%mm7, %%mm7\n\t":); |
0 | 673 do { |
674 __asm __volatile( | |
675 "movq %1, %%mm0\n\t" | |
676 "movq 1%1, %%mm1\n\t" | |
677 "movq %0, %%mm4\n\t" | |
678 "movq %%mm0, %%mm2\n\t" | |
679 "movq %%mm1, %%mm3\n\t" | |
680 "movq %%mm4, %%mm5\n\t" | |
681 "punpcklbw %%mm7, %%mm0\n\t" | |
682 "punpcklbw %%mm7, %%mm1\n\t" | |
683 "punpckhbw %%mm7, %%mm2\n\t" | |
684 "punpckhbw %%mm7, %%mm3\n\t" | |
685 "punpcklbw %%mm7, %%mm4\n\t" | |
686 "punpckhbw %%mm7, %%mm5\n\t" | |
687 "paddusw %%mm1, %%mm0\n\t" | |
688 "paddusw %%mm3, %%mm2\n\t" | |
689 "psrlw $1, %%mm0\n\t" | |
690 "psrlw $1, %%mm2\n\t" | |
691 "paddusw %%mm4, %%mm0\n\t" | |
692 "paddusw %%mm5, %%mm2\n\t" | |
693 "psrlw $1, %%mm0\n\t" | |
694 "psrlw $1, %%mm2\n\t" | |
695 "packuswb %%mm2, %%mm0\n\t" | |
696 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
697 :"+m"(*p) |
0 | 698 :"m"(*pix) |
699 :"memory"); | |
700 pix += line_size; | |
701 p += line_size; | |
702 } while (--h); | |
703 } | |
704 | |
705 static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
706 { | |
707 UINT8 *p; | |
708 const UINT8 *pix; | |
709 p = block; | |
710 pix = pixels; | |
711 __asm __volatile( | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
712 "pxor %%mm7, %%mm7\n\t":); |
0 | 713 do { |
714 __asm __volatile( | |
715 "movq %1, %%mm0\n\t" | |
716 "movq %2, %%mm1\n\t" | |
717 "movq %0, %%mm4\n\t" | |
718 "movq %%mm0, %%mm2\n\t" | |
719 "movq %%mm1, %%mm3\n\t" | |
720 "movq %%mm4, %%mm5\n\t" | |
721 "punpcklbw %%mm7, %%mm0\n\t" | |
722 "punpcklbw %%mm7, %%mm1\n\t" | |
723 "punpckhbw %%mm7, %%mm2\n\t" | |
724 "punpckhbw %%mm7, %%mm3\n\t" | |
725 "punpcklbw %%mm7, %%mm4\n\t" | |
726 "punpckhbw %%mm7, %%mm5\n\t" | |
727 "paddusw %%mm1, %%mm0\n\t" | |
728 "paddusw %%mm3, %%mm2\n\t" | |
729 "psrlw $1, %%mm0\n\t" | |
730 "psrlw $1, %%mm2\n\t" | |
731 "paddusw %%mm4, %%mm0\n\t" | |
732 "paddusw %%mm5, %%mm2\n\t" | |
733 "psrlw $1, %%mm0\n\t" | |
734 "psrlw $1, %%mm2\n\t" | |
735 "packuswb %%mm2, %%mm0\n\t" | |
736 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
737 :"+m"(*p) |
0 | 738 :"m"(*pix), "m"(*(pix+line_size)) |
739 :"memory"); | |
740 pix += line_size; | |
741 p += line_size ; | |
742 } while(--h); | |
743 } | |
744 | |
745 static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
746 { | |
747 UINT8 *p; | |
748 const UINT8 *pix; | |
749 p = block; | |
750 pix = pixels; | |
751 __asm __volatile( | |
752 "pxor %%mm7, %%mm7\n\t" | |
753 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
754 ::"m"(mm_wone)); |
0 | 755 do { |
756 __asm __volatile( | |
757 "movq %1, %%mm0\n\t" | |
758 "movq %2, %%mm1\n\t" | |
759 "movq 1%1, %%mm4\n\t" | |
760 "movq 1%2, %%mm5\n\t" | |
761 "movq %%mm0, %%mm2\n\t" | |
762 "movq %%mm1, %%mm3\n\t" | |
763 "punpcklbw %%mm7, %%mm0\n\t" | |
764 "punpcklbw %%mm7, %%mm1\n\t" | |
765 "punpckhbw %%mm7, %%mm2\n\t" | |
766 "punpckhbw %%mm7, %%mm3\n\t" | |
767 "paddusw %%mm1, %%mm0\n\t" | |
768 "paddusw %%mm3, %%mm2\n\t" | |
769 "movq %%mm4, %%mm1\n\t" | |
770 "movq %%mm5, %%mm3\n\t" | |
771 "punpcklbw %%mm7, %%mm4\n\t" | |
772 "punpcklbw %%mm7, %%mm5\n\t" | |
773 "punpckhbw %%mm7, %%mm1\n\t" | |
774 "punpckhbw %%mm7, %%mm3\n\t" | |
775 "paddusw %%mm5, %%mm4\n\t" | |
776 "paddusw %%mm3, %%mm1\n\t" | |
777 "paddusw %%mm6, %%mm4\n\t" | |
778 "paddusw %%mm6, %%mm1\n\t" | |
779 "paddusw %%mm4, %%mm0\n\t" | |
780 "paddusw %%mm1, %%mm2\n\t" | |
781 "movq %0, %%mm1\n\t" | |
782 "psrlw $2, %%mm0\n\t" | |
783 "movq %%mm1, %%mm3\n\t" | |
784 "psrlw $2, %%mm2\n\t" | |
785 "punpcklbw %%mm7, %%mm1\n\t" | |
786 "punpckhbw %%mm7, %%mm3\n\t" | |
787 "paddusw %%mm1, %%mm0\n\t" | |
788 "paddusw %%mm3, %%mm2\n\t" | |
789 "psrlw $1, %%mm0\n\t" | |
790 "psrlw $1, %%mm2\n\t" | |
791 "packuswb %%mm2, %%mm0\n\t" | |
792 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
793 :"+m"(*p) |
0 | 794 :"m"(*pix), |
795 "m"(*(pix+line_size)) | |
796 :"memory"); | |
797 pix += line_size; | |
798 p += line_size; | |
799 } while(--h); | |
800 } | |
801 | |
802 static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
803 { | |
804 DCTELEM *p; | |
805 const UINT8 *pix; | |
806 p = block; | |
807 pix = pixels; | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
808 __asm __volatile("pxor %%mm7, %%mm7":); |
0 | 809 do { |
810 __asm __volatile( | |
811 "movq %0, %%mm0\n\t" | |
812 "movq %1, %%mm2\n\t" | |
813 "movq 8%0, %%mm1\n\t" | |
814 "movq %%mm2, %%mm3\n\t" | |
815 "punpcklbw %%mm7, %%mm2\n\t" | |
816 "punpckhbw %%mm7, %%mm3\n\t" | |
817 "psubsw %%mm2, %%mm0\n\t" | |
818 "psubsw %%mm3, %%mm1\n\t" | |
819 "movq %%mm0, %0\n\t" | |
820 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
821 :"+m"(*p) |
0 | 822 :"m"(*pix) |
823 :"memory"); | |
824 pix += line_size; | |
825 p += 8; | |
826 } while (--h); | |
827 } | |
828 | |
829 static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
830 { | |
831 DCTELEM *p; | |
832 const UINT8 *pix; | |
833 p = block; | |
834 pix = pixels; | |
835 __asm __volatile( | |
836 "pxor %%mm7, %%mm7\n\t" | |
837 "movq %0, %%mm6" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
838 ::"m"(mm_wone)); |
0 | 839 do { |
840 __asm __volatile( | |
841 "movq %0, %%mm0\n\t" | |
842 "movq %1, %%mm2\n\t" | |
843 "movq 8%0, %%mm1\n\t" | |
844 "movq 1%1, %%mm4\n\t" | |
845 "movq %%mm2, %%mm3\n\t" | |
846 "movq %%mm4, %%mm5\n\t" | |
847 "punpcklbw %%mm7, %%mm2\n\t" | |
848 "punpckhbw %%mm7, %%mm3\n\t" | |
849 "punpcklbw %%mm7, %%mm4\n\t" | |
850 "punpckhbw %%mm7, %%mm5\n\t" | |
851 "paddusw %%mm4, %%mm2\n\t" | |
852 "paddusw %%mm5, %%mm3\n\t" | |
853 "paddusw %%mm6, %%mm2\n\t" | |
854 "paddusw %%mm6, %%mm3\n\t" | |
855 "psrlw $1, %%mm2\n\t" | |
856 "psrlw $1, %%mm3\n\t" | |
857 "psubsw %%mm2, %%mm0\n\t" | |
858 "psubsw %%mm3, %%mm1\n\t" | |
859 "movq %%mm0, %0\n\t" | |
860 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
861 :"+m"(*p) |
0 | 862 :"m"(*pix) |
863 :"memory"); | |
864 pix += line_size; | |
865 p += 8; | |
866 } while (--h); | |
867 } | |
868 | |
/*
 * Subtract a vertically interpolated prediction from a block, in place.
 *
 * For each of h rows: the 8 bytes at pix and the 8 bytes one row below
 * (pix + line_size) are widened to 16-bit words, added together
 * (unsigned-saturating), increased by the constant loaded from mm_wone,
 * shifted right by one, and then subtracted (signed-saturating) from the
 * eight 16-bit words stored at p.
 *
 * block     - destination words, updated in place; advanced by 8 elements per row
 * pixels    - source bytes; because each iteration also reads the following
 *             row, h + 1 source rows are read in total
 * line_size - distance in bytes between consecutive source rows
 * h         - number of rows; the do/while executes the body before testing
 *             --h, so a value below 1 is not handled
 *
 * No emms is executed here and only "memory" is listed as clobbered.
 * NOTE(review): mm_wone is defined outside this view; presumably four 16-bit
 * ones, which would make the halving round to nearest -- confirm.
 */
869 static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
870 { | |
871 DCTELEM *p; | |
872 const UINT8 *pix; | |
873 p = block; | |
874 pix = pixels; | |
/* Loop-invariant setup: mm7 = 0 (zero extension source), mm6 = mm_wone.
 * The loop below relies on both registers keeping these values between
 * the separate asm statements. */
875 __asm __volatile( | |
876 "pxor %%mm7, %%mm7\n\t" | |
877 "movq %0, %%mm6" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
878 ::"m"(mm_wone)); |
0 | 879 do { |
/* %0 = *p (read and written), %1 = current source row, %2 = row below it.
 * "8%0" prepends a displacement to the printed operand to reach p+8 bytes. */
880 __asm __volatile( | |
881 "movq %0, %%mm0\n\t" | |
882 "movq %1, %%mm2\n\t" | |
883 "movq 8%0, %%mm1\n\t" | |
884 "movq %2, %%mm4\n\t" | |
885 "movq %%mm2, %%mm3\n\t" | |
886 "movq %%mm4, %%mm5\n\t" | |
/* widen both byte vectors to words: low halves in mm2/mm4, high in mm3/mm5 */
887 "punpcklbw %%mm7, %%mm2\n\t" | |
888 "punpckhbw %%mm7, %%mm3\n\t" | |
889 "punpcklbw %%mm7, %%mm4\n\t" | |
890 "punpckhbw %%mm7, %%mm5\n\t" | |
/* (pix[i] + pix[i + line_size] + mm6) >> 1 */
891 "paddusw %%mm4, %%mm2\n\t" | |
892 "paddusw %%mm5, %%mm3\n\t" | |
893 "paddusw %%mm6, %%mm2\n\t" | |
894 "paddusw %%mm6, %%mm3\n\t" | |
895 "psrlw $1, %%mm2\n\t" | |
896 "psrlw $1, %%mm3\n\t" | |
/* subtract the interpolated values from the block words and store them back */
897 "psubsw %%mm2, %%mm0\n\t" | |
898 "psubsw %%mm3, %%mm1\n\t" | |
899 "movq %%mm0, %0\n\t" | |
900 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
901 :"+m"(*p) |
0 | 902 :"m"(*pix), "m"(*(pix+line_size)) |
903 :"memory"); | |
/* next source row, next 8 destination elements */
904 pix += line_size; | |
905 p += 8; | |
906 } while (--h); | |
907 } | |
908 | |
/*
 * Subtract a horizontally and vertically interpolated prediction from a
 * block, in place.
 *
 * For each of h rows, four 8-byte source vectors are read: pix, pix+1,
 * pix+line_size and pix+line_size+1. They are widened to 16-bit words and
 * summed (unsigned-saturating) together with the constant loaded from
 * mm_wtwo; the sum is shifted right by two and subtracted (signed-saturating)
 * from the eight 16-bit words stored at p.
 *
 * block     - destination words, updated in place; advanced by 8 elements per row
 * pixels    - source bytes; 9 bytes per row are read, and because each
 *             iteration also reads the following row, h + 1 rows in total
 * line_size - distance in bytes between consecutive source rows
 * h         - number of rows; the do/while executes the body before testing
 *             --h, so a value below 1 is not handled
 *
 * No emms is executed here and only "memory" is listed as clobbered.
 * NOTE(review): mm_wtwo is defined outside this view; presumably four 16-bit
 * twos, which would make the divide-by-four round to nearest -- confirm.
 */
909 static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
910 { | |
911 DCTELEM *p; | |
912 const UINT8 *pix; | |
913 p = block; | |
914 pix = pixels; | |
/* Loop-invariant setup: mm7 = 0 (zero extension source), mm6 = mm_wtwo.
 * The loop below relies on both registers keeping these values between
 * the separate asm statements. */
915 __asm __volatile( | |
916 "pxor %%mm7, %%mm7\n\t" | |
917 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
918 ::"m"(mm_wtwo)); |
0 | 919 do { |
/* %0 = *p (read and written), %1 = current source row, %2 = row below it.
 * "1%1", "1%2" and "8%0" prepend a displacement to the printed operand. */
920 __asm __volatile( | |
921 "movq %1, %%mm0\n\t" | |
922 "movq %2, %%mm1\n\t" | |
923 "movq 1%1, %%mm4\n\t" | |
924 "movq 1%2, %%mm5\n\t" | |
/* column x: widen both rows and add them (low words in mm0, high in mm2) */
925 "movq %%mm0, %%mm2\n\t" | |
926 "movq %%mm1, %%mm3\n\t" | |
927 "punpcklbw %%mm7, %%mm0\n\t" | |
928 "punpcklbw %%mm7, %%mm1\n\t" | |
929 "punpckhbw %%mm7, %%mm2\n\t" | |
930 "punpckhbw %%mm7, %%mm3\n\t" | |
931 "paddusw %%mm1, %%mm0\n\t" | |
932 "paddusw %%mm3, %%mm2\n\t" | |
/* column x+1: widen both rows and add them (low words in mm4, high in mm1) */
933 "movq %%mm4, %%mm1\n\t" | |
934 "movq %%mm5, %%mm3\n\t" | |
935 "punpcklbw %%mm7, %%mm4\n\t" | |
936 "punpcklbw %%mm7, %%mm5\n\t" | |
937 "punpckhbw %%mm7, %%mm1\n\t" | |
938 "punpckhbw %%mm7, %%mm3\n\t" | |
939 "paddusw %%mm5, %%mm4\n\t" | |
940 "paddusw %%mm3, %%mm1\n\t" | |
/* add the mm6 constant, then combine both column sums into mm0/mm2 */
941 "paddusw %%mm6, %%mm4\n\t" | |
942 "paddusw %%mm6, %%mm1\n\t" | |
943 "paddusw %%mm4, %%mm0\n\t" | |
944 "paddusw %%mm1, %%mm2\n\t" | |
/* load the block words, divide the sums by four, subtract and store back */
945 "movq %0, %%mm1\n\t" | |
946 "movq 8%0, %%mm3\n\t" | |
947 "psrlw $2, %%mm0\n\t" | |
948 "psrlw $2, %%mm2\n\t" | |
949 "psubsw %%mm0, %%mm1\n\t" | |
950 "psubsw %%mm2, %%mm3\n\t" | |
951 "movq %%mm1, %0\n\t" | |
952 "movq %%mm3, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
953 :"+m"(*p) |
0 | 954 :"m"(*pix), |
955 "m"(*(pix+line_size)) | |
956 :"memory"); | |
/* next source row, next 8 destination elements */
957 pix += line_size; | |
958 p += 8 ; | |
959 } while(--h); | |
960 } | |
961 | |
962 void dsputil_init_mmx(void) | |
963 { | |
964 mm_flags = mm_support(); | |
965 #if 0 | |
966 printf("CPU flags:"); | |
967 if (mm_flags & MM_MMX) | |
968 printf(" mmx"); | |
969 if (mm_flags & MM_MMXEXT) | |
970 printf(" mmxext"); | |
971 if (mm_flags & MM_3DNOW) | |
972 printf(" 3dnow"); | |
973 if (mm_flags & MM_SSE) | |
974 printf(" sse"); | |
975 if (mm_flags & MM_SSE2) | |
976 printf(" sse2"); | |
977 printf("\n"); | |
978 #endif | |
979 | |
980 if (mm_flags & MM_MMX) { | |
981 get_pixels = get_pixels_mmx; | |
982 put_pixels_clamped = put_pixels_clamped_mmx; | |
983 add_pixels_clamped = add_pixels_clamped_mmx; | |
984 | |
985 pix_abs16x16 = pix_abs16x16_mmx; | |
986 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
987 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
988 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; | |
989 av_fdct = fdct_mmx; | |
990 | |
991 put_pixels_tab[0] = put_pixels_mmx; | |
992 put_pixels_tab[1] = put_pixels_x2_mmx; | |
993 put_pixels_tab[2] = put_pixels_y2_mmx; | |
994 put_pixels_tab[3] = put_pixels_xy2_mmx; | |
995 | |
996 put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
997 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
998 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
999 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
1000 | |
1001 avg_pixels_tab[0] = avg_pixels_mmx; | |
1002 avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
1003 avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
1004 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1005 | |
1006 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
1007 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
1008 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
1009 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
1010 | |
1011 sub_pixels_tab[0] = sub_pixels_mmx; | |
1012 sub_pixels_tab[1] = sub_pixels_x2_mmx; | |
1013 sub_pixels_tab[2] = sub_pixels_y2_mmx; | |
1014 sub_pixels_tab[3] = sub_pixels_xy2_mmx; | |
1015 | |
1016 if (mm_flags & MM_MMXEXT) { | |
1017 pix_abs16x16 = pix_abs16x16_sse; | |
1018 } | |
1019 | |
1020 if (mm_flags & MM_SSE) { | |
1021 put_pixels_tab[1] = put_pixels_x2_sse; | |
1022 put_pixels_tab[2] = put_pixels_y2_sse; | |
1023 | |
1024 avg_pixels_tab[0] = avg_pixels_sse; | |
1025 avg_pixels_tab[1] = avg_pixels_x2_sse; | |
1026 avg_pixels_tab[2] = avg_pixels_y2_sse; | |
1027 avg_pixels_tab[3] = avg_pixels_xy2_sse; | |
1028 | |
1029 sub_pixels_tab[1] = sub_pixels_x2_sse; | |
1030 sub_pixels_tab[2] = sub_pixels_y2_sse; | |
1031 } else if (mm_flags & MM_3DNOW) { | |
1032 put_pixels_tab[1] = put_pixels_x2_3dnow; | |
1033 put_pixels_tab[2] = put_pixels_y2_3dnow; | |
1034 | |
1035 avg_pixels_tab[0] = avg_pixels_3dnow; | |
1036 avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
1037 avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
1038 avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
1039 | |
1040 sub_pixels_tab[1] = sub_pixels_x2_3dnow; | |
1041 sub_pixels_tab[2] = sub_pixels_y2_3dnow; | |
1042 } | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1043 |
42 | 1044 /* idct */ |
1045 if (mm_flags & MM_MMXEXT) { | |
1046 ff_idct = ff_mmxext_idct; | |
1047 } else { | |
1048 ff_idct = ff_mmx_idct; | |
1049 } | |
0 | 1050 } |
1051 } |