Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 314:289eb941b8ba libavcodec
* encoding of AC3 with more than 2 channels
by Takashi Iwai <tiwai@suse.de>
author | kabi |
---|---|
date | Mon, 08 Apr 2002 12:08:03 +0000 |
parents | c1a8a1b4a24b |
children | 9c6f056f0e41 |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
3 * Copyright (c) 2000, 2001 Gerard Lantau. | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 * | |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 #include "../dsputil.h" | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
23 #include "../simple_idct.h" |
0 | 24 |
5 | 25 int mm_flags; /* multimedia extension flags */ |
26 | |
294 | 27 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); |
28 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
29 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
30 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
31 | |
32 int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
33 int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
34 int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
35 int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
36 | |
37 int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
38 int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
39 int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
40 int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
41 | |
42 int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
43 int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
44 int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
45 int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
46 | |
0 | 47 |
42 | 48 /* external functions, from idct_mmx.c */ |
49 void ff_mmx_idct(DCTELEM *block); | |
50 void ff_mmxext_idct(DCTELEM *block); | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
51 |
/* pixel operations */

/* Four packed 16-bit words, each holding 1 (mm_wone) or 2 (mm_wtwo). */
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001LL;
static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002LL;

/* Emit an 8-byte alignment directive at the current point of the output. */
#define JUMPALIGN() __asm__ __volatile__ (".balign 8" : : )

/* Clear the MMX register `regd` by XOR-ing it with itself. */
#define MOVQ_ZERO(regd) __asm__ __volatile__ ("pxor %%" #regd ", %%" #regd : : )

/*
 * MOVQ_WONE(regd) / MOVQ_WTWO(regd): fill every 16-bit lane of `regd`
 * with 1 or 2 respectively.
 */
#ifdef PIC
// for shared library it's better to build the constants in registers
// instead of loading them from memory:
// pcmpeqd reg,reg sets all bits (-1); psrlw $15 leaves 1 in each word;
// an extra psllw $1 turns that into 2
#define MOVQ_WONE(regd) \
    __asm__ __volatile__ ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd : : )

#define MOVQ_WTWO(regd) \
    __asm__ __volatile__ ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd " \n\t" \
        "psllw $1, %%" #regd : : )
#else
/* non-PIC build: load the constants straight from mm_wone / mm_wtwo */
#define MOVQ_WONE(regd) __asm__ __volatile__ ("movq %0, %%" #regd " \n\t" : : "m"(mm_wone))
#define MOVQ_WTWO(regd) __asm__ __volatile__ ("movq %0, %%" #regd " \n\t" : : "m"(mm_wtwo))
#endif
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
78 |
0 | 79 /***********************************/ |
80 /* 3Dnow specific */ | |
81 | |
82 #define DEF(x) x ## _3dnow | |
83 /* for Athlons PAVGUSB is prefered */ | |
84 #define PAVGB "pavgusb" | |
85 | |
86 #include "dsputil_mmx_avg.h" | |
87 | |
88 #undef DEF | |
89 #undef PAVGB | |
90 | |
91 /***********************************/ | |
92 /* MMX2 specific */ | |
93 | |
94 #define DEF(x) x ## _sse | |
95 | |
96 /* Introduced only in MMX2 set */ | |
97 #define PAVGB "pavgb" | |
98 | |
99 #include "dsputil_mmx_avg.h" | |
100 | |
101 #undef DEF | |
102 #undef PAVGB | |
103 | |
104 /***********************************/ | |
105 /* standard MMX */ | |
106 | |
107 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
108 { | |
109 DCTELEM *p; | |
110 const UINT8 *pix; | |
111 int i; | |
112 | |
113 /* read the pixels */ | |
114 p = block; | |
115 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
116 MOVQ_ZERO(mm7); |
0 | 117 for(i=0;i<4;i++) { |
118 __asm __volatile( | |
119 "movq %1, %%mm0\n\t" | |
120 "movq %2, %%mm1\n\t" | |
121 "movq %%mm0, %%mm2\n\t" | |
122 "movq %%mm1, %%mm3\n\t" | |
123 "punpcklbw %%mm7, %%mm0\n\t" | |
124 "punpckhbw %%mm7, %%mm2\n\t" | |
125 "punpcklbw %%mm7, %%mm1\n\t" | |
126 "punpckhbw %%mm7, %%mm3\n\t" | |
127 "movq %%mm0, %0\n\t" | |
128 "movq %%mm2, 8%0\n\t" | |
129 "movq %%mm1, 16%0\n\t" | |
130 "movq %%mm3, 24%0\n\t" | |
131 :"=m"(*p) | |
132 :"m"(*pix), "m"(*(pix+line_size)) | |
133 :"memory"); | |
134 pix += line_size*2; | |
135 p += 16; | |
136 } | |
137 } | |
138 | |
139 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
140 { | |
141 const DCTELEM *p; | |
142 UINT8 *pix; | |
143 | |
144 /* read the pixels */ | |
145 p = block; | |
146 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
147 /* unrolled loop */ |
0 | 148 __asm __volatile( |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
149 "movq %3, %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
150 "movq 8%3, %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
151 "movq 16%3, %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
152 "movq 24%3, %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
153 "movq 32%3, %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
154 "movq 40%3, %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
155 "movq 48%3, %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
156 "movq 56%3, %%mm7\n\t" |
0 | 157 "packuswb %%mm1, %%mm0\n\t" |
158 "packuswb %%mm3, %%mm2\n\t" | |
159 "packuswb %%mm5, %%mm4\n\t" | |
160 "packuswb %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
161 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
162 "movq %%mm2, (%0, %1)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
163 "movq %%mm4, (%0, %1, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
164 "movq %%mm6, (%0, %2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
165 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) |
0 | 166 :"memory"); |
167 pix += line_size*4; | |
168 p += 32; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
169 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
170 // if here would be an exact copy of the code above |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
171 // compiler would generate some very strange code |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
172 // thus using "r" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
173 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
174 "movq (%3), %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
175 "movq 8(%3), %%mm1\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
176 "movq 16(%3), %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
177 "movq 24(%3), %%mm3\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
178 "movq 32(%3), %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
179 "movq 40(%3), %%mm5\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
180 "movq 48(%3), %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
181 "movq 56(%3), %%mm7\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
182 "packuswb %%mm1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
183 "packuswb %%mm3, %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
184 "packuswb %%mm5, %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
185 "packuswb %%mm7, %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
186 "movq %%mm0, (%0)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
187 "movq %%mm2, (%0, %1)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
188 "movq %%mm4, (%0, %1, 2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
189 "movq %%mm6, (%0, %2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
190 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
191 :"memory"); |
0 | 192 } |
193 | |
194 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
195 { | |
196 const DCTELEM *p; | |
197 UINT8 *pix; | |
198 int i; | |
199 | |
200 /* read the pixels */ | |
201 p = block; | |
202 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
203 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
204 i = 4; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
205 while (i) { |
0 | 206 __asm __volatile( |
207 "movq %2, %%mm0\n\t" | |
208 "movq 8%2, %%mm1\n\t" | |
209 "movq 16%2, %%mm2\n\t" | |
210 "movq 24%2, %%mm3\n\t" | |
211 "movq %0, %%mm4\n\t" | |
212 "movq %1, %%mm6\n\t" | |
213 "movq %%mm4, %%mm5\n\t" | |
214 "punpcklbw %%mm7, %%mm4\n\t" | |
215 "punpckhbw %%mm7, %%mm5\n\t" | |
216 "paddsw %%mm4, %%mm0\n\t" | |
217 "paddsw %%mm5, %%mm1\n\t" | |
218 "movq %%mm6, %%mm5\n\t" | |
219 "punpcklbw %%mm7, %%mm6\n\t" | |
220 "punpckhbw %%mm7, %%mm5\n\t" | |
221 "paddsw %%mm6, %%mm2\n\t" | |
222 "paddsw %%mm5, %%mm3\n\t" | |
223 "packuswb %%mm1, %%mm0\n\t" | |
224 "packuswb %%mm3, %%mm2\n\t" | |
225 "movq %%mm0, %0\n\t" | |
226 "movq %%mm2, %1\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
227 :"+m"(*pix), "+m"(*(pix+line_size)) |
0 | 228 :"m"(*p) |
229 :"memory"); | |
230 pix += line_size*2; | |
231 p += 16; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
232 i--; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
233 }; |
0 | 234 } |
235 | |
236 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
237 { | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
238 int hh; |
0 | 239 UINT8 *p; |
240 const UINT8 *pix; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
241 |
0 | 242 p = block; |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
243 pix = pixels; // 2s |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
244 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
245 do { |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
246 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
247 "movq %1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
248 "movq %%mm0, %0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
249 :"=m"(*p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
250 :"m"(*pix) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
251 :"memory"); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
252 pix += line_size; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
253 p += line_size; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
254 } while (--h); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
255 #else |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
256 // this optimized code is not very usefull |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
257 // the above loop is definitely faster |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
258 // at least on Celeron 500MHz |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
259 hh = h & 3; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
260 while (hh) { |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
261 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
262 "movq %1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
263 "movq %%mm0, %0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
264 :"=m"(*p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
265 :"m"(*pix) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
266 :"memory"); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
267 pix += line_size; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
268 p += line_size; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
269 hh--; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
270 } |
0 | 271 hh=h>>2; |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
272 while (hh) { |
0 | 273 __asm __volatile( |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
274 "movq (%1), %%mm0 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
275 "movq (%1, %2), %%mm1 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
276 "movq (%1, %2, 2), %%mm2 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
277 "movq (%1, %3), %%mm3 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
278 "movq %%mm0, (%0) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
279 "movq %%mm1, (%0, %2) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
280 "movq %%mm2, (%0, %2, 2) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
281 "movq %%mm3, (%0, %3) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
282 ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3) |
0 | 283 :"memory"); |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
284 pix += line_size*4; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
285 p += line_size*4; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
286 hh--; |
0 | 287 } |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
288 #endif |
0 | 289 } |
290 | |
291 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
292 { | |
293 UINT8 *p; | |
294 const UINT8 *pix; | |
295 p = block; | |
296 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
297 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
298 MOVQ_WONE(mm4); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
299 JUMPALIGN(); |
0 | 300 do { |
301 __asm __volatile( | |
302 "movq %1, %%mm0\n\t" | |
303 "movq 1%1, %%mm1\n\t" | |
304 "movq %%mm0, %%mm2\n\t" | |
305 "movq %%mm1, %%mm3\n\t" | |
306 "punpcklbw %%mm7, %%mm0\n\t" | |
307 "punpcklbw %%mm7, %%mm1\n\t" | |
308 "punpckhbw %%mm7, %%mm2\n\t" | |
309 "punpckhbw %%mm7, %%mm3\n\t" | |
310 "paddusw %%mm1, %%mm0\n\t" | |
311 "paddusw %%mm3, %%mm2\n\t" | |
312 "paddusw %%mm4, %%mm0\n\t" | |
313 "paddusw %%mm4, %%mm2\n\t" | |
314 "psrlw $1, %%mm0\n\t" | |
315 "psrlw $1, %%mm2\n\t" | |
316 "packuswb %%mm2, %%mm0\n\t" | |
317 "movq %%mm0, %0\n\t" | |
318 :"=m"(*p) | |
319 :"m"(*pix) | |
320 :"memory"); | |
321 pix += line_size; p += line_size; | |
322 } while (--h); | |
323 } | |
324 | |
325 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
326 { | |
327 UINT8 *p; | |
328 const UINT8 *pix; | |
329 p = block; | |
330 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
331 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
332 MOVQ_WONE(mm4); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
333 JUMPALIGN(); |
0 | 334 do { |
335 __asm __volatile( | |
336 "movq %1, %%mm0\n\t" | |
337 "movq %2, %%mm1\n\t" | |
338 "movq %%mm0, %%mm2\n\t" | |
339 "movq %%mm1, %%mm3\n\t" | |
340 "punpcklbw %%mm7, %%mm0\n\t" | |
341 "punpcklbw %%mm7, %%mm1\n\t" | |
342 "punpckhbw %%mm7, %%mm2\n\t" | |
343 "punpckhbw %%mm7, %%mm3\n\t" | |
344 "paddusw %%mm1, %%mm0\n\t" | |
345 "paddusw %%mm3, %%mm2\n\t" | |
346 "paddusw %%mm4, %%mm0\n\t" | |
347 "paddusw %%mm4, %%mm2\n\t" | |
348 "psrlw $1, %%mm0\n\t" | |
349 "psrlw $1, %%mm2\n\t" | |
350 "packuswb %%mm2, %%mm0\n\t" | |
351 "movq %%mm0, %0\n\t" | |
352 :"=m"(*p) | |
353 :"m"(*pix), | |
354 "m"(*(pix+line_size)) | |
355 :"memory"); | |
356 pix += line_size; | |
357 p += line_size; | |
358 } while (--h); | |
359 } | |
360 | |
361 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
362 { | |
363 UINT8 *p; | |
364 const UINT8 *pix; | |
365 p = block; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
366 pix = pixels; // 1s |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
367 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
368 MOVQ_WTWO(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
369 JUMPALIGN(); |
0 | 370 do { |
371 __asm __volatile( | |
372 "movq %1, %%mm0\n\t" | |
373 "movq %2, %%mm1\n\t" | |
374 "movq 1%1, %%mm4\n\t" | |
375 "movq 1%2, %%mm5\n\t" | |
376 "movq %%mm0, %%mm2\n\t" | |
377 "movq %%mm1, %%mm3\n\t" | |
378 "punpcklbw %%mm7, %%mm0\n\t" | |
379 "punpcklbw %%mm7, %%mm1\n\t" | |
380 "punpckhbw %%mm7, %%mm2\n\t" | |
381 "punpckhbw %%mm7, %%mm3\n\t" | |
382 "paddusw %%mm1, %%mm0\n\t" | |
383 "paddusw %%mm3, %%mm2\n\t" | |
384 "movq %%mm4, %%mm1\n\t" | |
385 "movq %%mm5, %%mm3\n\t" | |
386 "punpcklbw %%mm7, %%mm4\n\t" | |
387 "punpcklbw %%mm7, %%mm5\n\t" | |
388 "punpckhbw %%mm7, %%mm1\n\t" | |
389 "punpckhbw %%mm7, %%mm3\n\t" | |
390 "paddusw %%mm5, %%mm4\n\t" | |
391 "paddusw %%mm3, %%mm1\n\t" | |
392 "paddusw %%mm6, %%mm4\n\t" | |
393 "paddusw %%mm6, %%mm1\n\t" | |
394 "paddusw %%mm4, %%mm0\n\t" | |
395 "paddusw %%mm1, %%mm2\n\t" | |
396 "psrlw $2, %%mm0\n\t" | |
397 "psrlw $2, %%mm2\n\t" | |
398 "packuswb %%mm2, %%mm0\n\t" | |
399 "movq %%mm0, %0\n\t" | |
400 :"=m"(*p) | |
401 :"m"(*pix), | |
402 "m"(*(pix+line_size)) | |
403 :"memory"); | |
404 pix += line_size; | |
405 p += line_size; | |
406 } while(--h); | |
407 } | |
408 | |
409 static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
410 { | |
411 UINT8 *p; | |
412 const UINT8 *pix; | |
413 p = block; | |
414 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
415 MOVQ_ZERO(mm7); |
0 | 416 do { |
417 __asm __volatile( | |
418 "movq %1, %%mm0\n\t" | |
419 "movq 1%1, %%mm1\n\t" | |
420 "movq %%mm0, %%mm2\n\t" | |
421 "movq %%mm1, %%mm3\n\t" | |
422 "punpcklbw %%mm7, %%mm0\n\t" | |
423 "punpcklbw %%mm7, %%mm1\n\t" | |
424 "punpckhbw %%mm7, %%mm2\n\t" | |
425 "punpckhbw %%mm7, %%mm3\n\t" | |
426 "paddusw %%mm1, %%mm0\n\t" | |
427 "paddusw %%mm3, %%mm2\n\t" | |
428 "psrlw $1, %%mm0\n\t" | |
429 "psrlw $1, %%mm2\n\t" | |
430 "packuswb %%mm2, %%mm0\n\t" | |
431 "movq %%mm0, %0\n\t" | |
432 :"=m"(*p) | |
433 :"m"(*pix) | |
434 :"memory"); | |
435 pix += line_size; | |
436 p += line_size; | |
437 } while (--h); | |
438 } | |
439 | |
440 static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
441 { | |
442 UINT8 *p; | |
443 const UINT8 *pix; | |
444 p = block; | |
445 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
446 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
447 JUMPALIGN(); |
0 | 448 do { |
449 __asm __volatile( | |
450 "movq %1, %%mm0\n\t" | |
451 "movq %2, %%mm1\n\t" | |
452 "movq %%mm0, %%mm2\n\t" | |
453 "movq %%mm1, %%mm3\n\t" | |
454 "punpcklbw %%mm7, %%mm0\n\t" | |
455 "punpcklbw %%mm7, %%mm1\n\t" | |
456 "punpckhbw %%mm7, %%mm2\n\t" | |
457 "punpckhbw %%mm7, %%mm3\n\t" | |
458 "paddusw %%mm1, %%mm0\n\t" | |
459 "paddusw %%mm3, %%mm2\n\t" | |
460 "psrlw $1, %%mm0\n\t" | |
461 "psrlw $1, %%mm2\n\t" | |
462 "packuswb %%mm2, %%mm0\n\t" | |
463 "movq %%mm0, %0\n\t" | |
464 :"=m"(*p) | |
465 :"m"(*pix), | |
466 "m"(*(pix+line_size)) | |
467 :"memory"); | |
468 pix += line_size; | |
469 p += line_size; | |
470 } while(--h); | |
471 } | |
472 | |
473 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
474 { | |
475 UINT8 *p; | |
476 const UINT8 *pix; | |
477 p = block; | |
478 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
479 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
480 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
481 JUMPALIGN(); |
0 | 482 do { |
483 __asm __volatile( | |
484 "movq %1, %%mm0\n\t" | |
485 "movq %2, %%mm1\n\t" | |
486 "movq 1%1, %%mm4\n\t" | |
487 "movq 1%2, %%mm5\n\t" | |
488 "movq %%mm0, %%mm2\n\t" | |
489 "movq %%mm1, %%mm3\n\t" | |
490 "punpcklbw %%mm7, %%mm0\n\t" | |
491 "punpcklbw %%mm7, %%mm1\n\t" | |
492 "punpckhbw %%mm7, %%mm2\n\t" | |
493 "punpckhbw %%mm7, %%mm3\n\t" | |
494 "paddusw %%mm1, %%mm0\n\t" | |
495 "paddusw %%mm3, %%mm2\n\t" | |
496 "movq %%mm4, %%mm1\n\t" | |
497 "movq %%mm5, %%mm3\n\t" | |
498 "punpcklbw %%mm7, %%mm4\n\t" | |
499 "punpcklbw %%mm7, %%mm5\n\t" | |
500 "punpckhbw %%mm7, %%mm1\n\t" | |
501 "punpckhbw %%mm7, %%mm3\n\t" | |
502 "paddusw %%mm5, %%mm4\n\t" | |
503 "paddusw %%mm3, %%mm1\n\t" | |
504 "paddusw %%mm6, %%mm4\n\t" | |
505 "paddusw %%mm6, %%mm1\n\t" | |
506 "paddusw %%mm4, %%mm0\n\t" | |
507 "paddusw %%mm1, %%mm2\n\t" | |
508 "psrlw $2, %%mm0\n\t" | |
509 "psrlw $2, %%mm2\n\t" | |
510 "packuswb %%mm2, %%mm0\n\t" | |
511 "movq %%mm0, %0\n\t" | |
512 :"=m"(*p) | |
513 :"m"(*pix), | |
514 "m"(*(pix+line_size)) | |
515 :"memory"); | |
516 pix += line_size; | |
517 p += line_size; | |
518 } while(--h); | |
519 } | |
520 | |
521 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
522 { | |
523 UINT8 *p; | |
524 const UINT8 *pix; | |
525 p = block; | |
526 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
527 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
528 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
529 JUMPALIGN(); |
0 | 530 do { |
531 __asm __volatile( | |
532 "movq %0, %%mm0\n\t" | |
533 "movq %1, %%mm1\n\t" | |
534 "movq %%mm0, %%mm2\n\t" | |
535 "movq %%mm1, %%mm3\n\t" | |
536 "punpcklbw %%mm7, %%mm0\n\t" | |
537 "punpcklbw %%mm7, %%mm1\n\t" | |
538 "punpckhbw %%mm7, %%mm2\n\t" | |
539 "punpckhbw %%mm7, %%mm3\n\t" | |
540 "paddusw %%mm1, %%mm0\n\t" | |
541 "paddusw %%mm3, %%mm2\n\t" | |
542 "paddusw %%mm6, %%mm0\n\t" | |
543 "paddusw %%mm6, %%mm2\n\t" | |
544 "psrlw $1, %%mm0\n\t" | |
545 "psrlw $1, %%mm2\n\t" | |
546 "packuswb %%mm2, %%mm0\n\t" | |
547 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
548 :"+m"(*p) |
0 | 549 :"m"(*pix) |
550 :"memory"); | |
551 pix += line_size; | |
552 p += line_size; | |
553 } | |
554 while (--h); | |
555 } | |
556 | |
557 static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
558 { | |
559 UINT8 *p; | |
560 const UINT8 *pix; | |
561 p = block; | |
562 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
563 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
564 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
565 JUMPALIGN(); |
0 | 566 do { |
567 __asm __volatile( | |
568 "movq %1, %%mm1\n\t" | |
569 "movq %0, %%mm0\n\t" | |
570 "movq 1%1, %%mm4\n\t" | |
571 "movq %%mm0, %%mm2\n\t" | |
572 "movq %%mm1, %%mm3\n\t" | |
573 "movq %%mm4, %%mm5\n\t" | |
574 "punpcklbw %%mm7, %%mm1\n\t" | |
575 "punpckhbw %%mm7, %%mm3\n\t" | |
576 "punpcklbw %%mm7, %%mm4\n\t" | |
577 "punpckhbw %%mm7, %%mm5\n\t" | |
578 "punpcklbw %%mm7, %%mm0\n\t" | |
579 "punpckhbw %%mm7, %%mm2\n\t" | |
580 "paddusw %%mm4, %%mm1\n\t" | |
581 "paddusw %%mm5, %%mm3\n\t" | |
582 "paddusw %%mm6, %%mm1\n\t" | |
583 "paddusw %%mm6, %%mm3\n\t" | |
584 "psrlw $1, %%mm1\n\t" | |
585 "psrlw $1, %%mm3\n\t" | |
586 "paddusw %%mm6, %%mm0\n\t" | |
587 "paddusw %%mm6, %%mm2\n\t" | |
588 "paddusw %%mm1, %%mm0\n\t" | |
589 "paddusw %%mm3, %%mm2\n\t" | |
590 "psrlw $1, %%mm0\n\t" | |
591 "psrlw $1, %%mm2\n\t" | |
592 "packuswb %%mm2, %%mm0\n\t" | |
593 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
594 :"+m"(*p) |
0 | 595 :"m"(*pix) |
596 :"memory"); | |
597 pix += line_size; | |
598 p += line_size; | |
599 } while (--h); | |
600 } | |
601 | |
602 static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
603 { | |
604 UINT8 *p; | |
605 const UINT8 *pix; | |
606 p = block; | |
607 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
608 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
609 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
610 JUMPALIGN(); |
0 | 611 do { |
612 __asm __volatile( | |
613 "movq %1, %%mm1\n\t" | |
614 "movq %0, %%mm0\n\t" | |
615 "movq %2, %%mm4\n\t" | |
616 "movq %%mm0, %%mm2\n\t" | |
617 "movq %%mm1, %%mm3\n\t" | |
618 "movq %%mm4, %%mm5\n\t" | |
619 "punpcklbw %%mm7, %%mm1\n\t" | |
620 "punpckhbw %%mm7, %%mm3\n\t" | |
621 "punpcklbw %%mm7, %%mm4\n\t" | |
622 "punpckhbw %%mm7, %%mm5\n\t" | |
623 "punpcklbw %%mm7, %%mm0\n\t" | |
624 "punpckhbw %%mm7, %%mm2\n\t" | |
625 "paddusw %%mm4, %%mm1\n\t" | |
626 "paddusw %%mm5, %%mm3\n\t" | |
627 "paddusw %%mm6, %%mm1\n\t" | |
628 "paddusw %%mm6, %%mm3\n\t" | |
629 "psrlw $1, %%mm1\n\t" | |
630 "psrlw $1, %%mm3\n\t" | |
631 "paddusw %%mm6, %%mm0\n\t" | |
632 "paddusw %%mm6, %%mm2\n\t" | |
633 "paddusw %%mm1, %%mm0\n\t" | |
634 "paddusw %%mm3, %%mm2\n\t" | |
635 "psrlw $1, %%mm0\n\t" | |
636 "psrlw $1, %%mm2\n\t" | |
637 "packuswb %%mm2, %%mm0\n\t" | |
638 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
639 :"+m"(*p) |
0 | 640 :"m"(*pix), "m"(*(pix+line_size)) |
641 :"memory"); | |
642 pix += line_size; | |
643 p += line_size ; | |
644 } while(--h); | |
645 } | |
646 | |
647 static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
648 { | |
649 UINT8 *p; | |
650 const UINT8 *pix; | |
651 p = block; | |
652 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
653 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
654 // this doesn't seem to be used offten - so |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
655 // the inside usage of mm_wone is not optimized |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
656 MOVQ_WTWO(mm6); |
0 | 657 do { |
658 __asm __volatile( | |
659 "movq %1, %%mm0\n\t" | |
660 "movq %2, %%mm1\n\t" | |
661 "movq 1%1, %%mm4\n\t" | |
662 "movq 1%2, %%mm5\n\t" | |
663 "movq %%mm0, %%mm2\n\t" | |
664 "movq %%mm1, %%mm3\n\t" | |
665 "punpcklbw %%mm7, %%mm0\n\t" | |
666 "punpcklbw %%mm7, %%mm1\n\t" | |
667 "punpckhbw %%mm7, %%mm2\n\t" | |
668 "punpckhbw %%mm7, %%mm3\n\t" | |
669 "paddusw %%mm1, %%mm0\n\t" | |
670 "paddusw %%mm3, %%mm2\n\t" | |
671 "movq %%mm4, %%mm1\n\t" | |
672 "movq %%mm5, %%mm3\n\t" | |
673 "punpcklbw %%mm7, %%mm4\n\t" | |
674 "punpcklbw %%mm7, %%mm5\n\t" | |
675 "punpckhbw %%mm7, %%mm1\n\t" | |
676 "punpckhbw %%mm7, %%mm3\n\t" | |
677 "paddusw %%mm5, %%mm4\n\t" | |
678 "paddusw %%mm3, %%mm1\n\t" | |
679 "paddusw %%mm6, %%mm4\n\t" | |
680 "paddusw %%mm6, %%mm1\n\t" | |
681 "paddusw %%mm4, %%mm0\n\t" | |
682 "paddusw %%mm1, %%mm2\n\t" | |
683 "movq %3, %%mm5\n\t" | |
684 "psrlw $2, %%mm0\n\t" | |
685 "movq %0, %%mm1\n\t" | |
686 "psrlw $2, %%mm2\n\t" | |
687 "movq %%mm1, %%mm3\n\t" | |
688 "punpcklbw %%mm7, %%mm1\n\t" | |
689 "punpckhbw %%mm7, %%mm3\n\t" | |
690 "paddusw %%mm1, %%mm0\n\t" | |
691 "paddusw %%mm3, %%mm2\n\t" | |
692 "paddusw %%mm5, %%mm0\n\t" | |
693 "paddusw %%mm5, %%mm2\n\t" | |
694 "psrlw $1, %%mm0\n\t" | |
695 "psrlw $1, %%mm2\n\t" | |
696 "packuswb %%mm2, %%mm0\n\t" | |
697 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
698 :"+m"(*p) |
0 | 699 :"m"(*pix), |
8 | 700 "m"(*(pix+line_size)), "m"(mm_wone) |
0 | 701 :"memory"); |
702 pix += line_size; | |
703 p += line_size ; | |
704 } while(--h); | |
705 } | |
706 | |
707 static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
708 { | |
709 UINT8 *p; | |
710 const UINT8 *pix; | |
711 p = block; | |
712 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
713 MOVQ_ZERO(mm7); |
0 | 714 do { |
715 __asm __volatile( | |
716 "movq %1, %%mm0\n\t" | |
717 "movq %0, %%mm1\n\t" | |
718 "movq %%mm0, %%mm2\n\t" | |
719 "movq %%mm1, %%mm3\n\t" | |
720 "punpcklbw %%mm7, %%mm0\n\t" | |
721 "punpcklbw %%mm7, %%mm1\n\t" | |
722 "punpckhbw %%mm7, %%mm2\n\t" | |
723 "punpckhbw %%mm7, %%mm3\n\t" | |
724 "paddusw %%mm1, %%mm0\n\t" | |
725 "paddusw %%mm3, %%mm2\n\t" | |
726 "psrlw $1, %%mm0\n\t" | |
727 "psrlw $1, %%mm2\n\t" | |
728 "packuswb %%mm2, %%mm0\n\t" | |
729 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
730 :"+m"(*p) |
0 | 731 :"m"(*pix) |
732 :"memory"); | |
733 pix += line_size; | |
734 p += line_size ; | |
735 } while (--h); | |
736 } | |
737 | |
738 static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
739 { | |
740 UINT8 *p; | |
741 const UINT8 *pix; | |
742 p = block; | |
743 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
744 MOVQ_ZERO(mm7); |
0 | 745 do { |
746 __asm __volatile( | |
747 "movq %1, %%mm0\n\t" | |
748 "movq 1%1, %%mm1\n\t" | |
749 "movq %0, %%mm4\n\t" | |
750 "movq %%mm0, %%mm2\n\t" | |
751 "movq %%mm1, %%mm3\n\t" | |
752 "movq %%mm4, %%mm5\n\t" | |
753 "punpcklbw %%mm7, %%mm0\n\t" | |
754 "punpcklbw %%mm7, %%mm1\n\t" | |
755 "punpckhbw %%mm7, %%mm2\n\t" | |
756 "punpckhbw %%mm7, %%mm3\n\t" | |
757 "punpcklbw %%mm7, %%mm4\n\t" | |
758 "punpckhbw %%mm7, %%mm5\n\t" | |
759 "paddusw %%mm1, %%mm0\n\t" | |
760 "paddusw %%mm3, %%mm2\n\t" | |
761 "psrlw $1, %%mm0\n\t" | |
762 "psrlw $1, %%mm2\n\t" | |
763 "paddusw %%mm4, %%mm0\n\t" | |
764 "paddusw %%mm5, %%mm2\n\t" | |
765 "psrlw $1, %%mm0\n\t" | |
766 "psrlw $1, %%mm2\n\t" | |
767 "packuswb %%mm2, %%mm0\n\t" | |
768 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
769 :"+m"(*p) |
0 | 770 :"m"(*pix) |
771 :"memory"); | |
772 pix += line_size; | |
773 p += line_size; | |
774 } while (--h); | |
775 } | |
776 | |
777 static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
778 { | |
779 UINT8 *p; | |
780 const UINT8 *pix; | |
781 p = block; | |
782 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
783 MOVQ_ZERO(mm7); |
0 | 784 do { |
785 __asm __volatile( | |
786 "movq %1, %%mm0\n\t" | |
787 "movq %2, %%mm1\n\t" | |
788 "movq %0, %%mm4\n\t" | |
789 "movq %%mm0, %%mm2\n\t" | |
790 "movq %%mm1, %%mm3\n\t" | |
791 "movq %%mm4, %%mm5\n\t" | |
792 "punpcklbw %%mm7, %%mm0\n\t" | |
793 "punpcklbw %%mm7, %%mm1\n\t" | |
794 "punpckhbw %%mm7, %%mm2\n\t" | |
795 "punpckhbw %%mm7, %%mm3\n\t" | |
796 "punpcklbw %%mm7, %%mm4\n\t" | |
797 "punpckhbw %%mm7, %%mm5\n\t" | |
798 "paddusw %%mm1, %%mm0\n\t" | |
799 "paddusw %%mm3, %%mm2\n\t" | |
800 "psrlw $1, %%mm0\n\t" | |
801 "psrlw $1, %%mm2\n\t" | |
802 "paddusw %%mm4, %%mm0\n\t" | |
803 "paddusw %%mm5, %%mm2\n\t" | |
804 "psrlw $1, %%mm0\n\t" | |
805 "psrlw $1, %%mm2\n\t" | |
806 "packuswb %%mm2, %%mm0\n\t" | |
807 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
808 :"+m"(*p) |
0 | 809 :"m"(*pix), "m"(*(pix+line_size)) |
810 :"memory"); | |
811 pix += line_size; | |
812 p += line_size ; | |
813 } while(--h); | |
814 } | |
815 | |
816 static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
817 { | |
818 UINT8 *p; | |
819 const UINT8 *pix; | |
820 p = block; | |
821 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
822 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
823 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
824 JUMPALIGN(); |
0 | 825 do { |
826 __asm __volatile( | |
827 "movq %1, %%mm0\n\t" | |
828 "movq %2, %%mm1\n\t" | |
829 "movq 1%1, %%mm4\n\t" | |
830 "movq 1%2, %%mm5\n\t" | |
831 "movq %%mm0, %%mm2\n\t" | |
832 "movq %%mm1, %%mm3\n\t" | |
833 "punpcklbw %%mm7, %%mm0\n\t" | |
834 "punpcklbw %%mm7, %%mm1\n\t" | |
835 "punpckhbw %%mm7, %%mm2\n\t" | |
836 "punpckhbw %%mm7, %%mm3\n\t" | |
837 "paddusw %%mm1, %%mm0\n\t" | |
838 "paddusw %%mm3, %%mm2\n\t" | |
839 "movq %%mm4, %%mm1\n\t" | |
840 "movq %%mm5, %%mm3\n\t" | |
841 "punpcklbw %%mm7, %%mm4\n\t" | |
842 "punpcklbw %%mm7, %%mm5\n\t" | |
843 "punpckhbw %%mm7, %%mm1\n\t" | |
844 "punpckhbw %%mm7, %%mm3\n\t" | |
845 "paddusw %%mm5, %%mm4\n\t" | |
846 "paddusw %%mm3, %%mm1\n\t" | |
847 "paddusw %%mm6, %%mm4\n\t" | |
848 "paddusw %%mm6, %%mm1\n\t" | |
849 "paddusw %%mm4, %%mm0\n\t" | |
850 "paddusw %%mm1, %%mm2\n\t" | |
851 "movq %0, %%mm1\n\t" | |
852 "psrlw $2, %%mm0\n\t" | |
853 "movq %%mm1, %%mm3\n\t" | |
854 "psrlw $2, %%mm2\n\t" | |
855 "punpcklbw %%mm7, %%mm1\n\t" | |
856 "punpckhbw %%mm7, %%mm3\n\t" | |
857 "paddusw %%mm1, %%mm0\n\t" | |
858 "paddusw %%mm3, %%mm2\n\t" | |
859 "psrlw $1, %%mm0\n\t" | |
860 "psrlw $1, %%mm2\n\t" | |
861 "packuswb %%mm2, %%mm0\n\t" | |
862 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
863 :"+m"(*p) |
0 | 864 :"m"(*pix), |
865 "m"(*(pix+line_size)) | |
866 :"memory"); | |
867 pix += line_size; | |
868 p += line_size; | |
869 } while(--h); | |
870 } | |
871 | |
872 static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
873 { | |
874 DCTELEM *p; | |
875 const UINT8 *pix; | |
876 p = block; | |
877 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
878 MOVQ_ZERO(mm7); |
0 | 879 do { |
880 __asm __volatile( | |
881 "movq %0, %%mm0\n\t" | |
882 "movq %1, %%mm2\n\t" | |
883 "movq 8%0, %%mm1\n\t" | |
884 "movq %%mm2, %%mm3\n\t" | |
885 "punpcklbw %%mm7, %%mm2\n\t" | |
886 "punpckhbw %%mm7, %%mm3\n\t" | |
887 "psubsw %%mm2, %%mm0\n\t" | |
888 "psubsw %%mm3, %%mm1\n\t" | |
889 "movq %%mm0, %0\n\t" | |
890 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
891 :"+m"(*p) |
0 | 892 :"m"(*pix) |
893 :"memory"); | |
894 pix += line_size; | |
895 p += 8; | |
896 } while (--h); | |
897 } | |
898 | |
899 static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
900 { | |
901 DCTELEM *p; | |
902 const UINT8 *pix; | |
903 p = block; | |
904 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
905 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
906 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
907 JUMPALIGN(); |
0 | 908 do { |
909 __asm __volatile( | |
910 "movq %0, %%mm0\n\t" | |
911 "movq %1, %%mm2\n\t" | |
912 "movq 8%0, %%mm1\n\t" | |
913 "movq 1%1, %%mm4\n\t" | |
914 "movq %%mm2, %%mm3\n\t" | |
915 "movq %%mm4, %%mm5\n\t" | |
916 "punpcklbw %%mm7, %%mm2\n\t" | |
917 "punpckhbw %%mm7, %%mm3\n\t" | |
918 "punpcklbw %%mm7, %%mm4\n\t" | |
919 "punpckhbw %%mm7, %%mm5\n\t" | |
920 "paddusw %%mm4, %%mm2\n\t" | |
921 "paddusw %%mm5, %%mm3\n\t" | |
922 "paddusw %%mm6, %%mm2\n\t" | |
923 "paddusw %%mm6, %%mm3\n\t" | |
924 "psrlw $1, %%mm2\n\t" | |
925 "psrlw $1, %%mm3\n\t" | |
926 "psubsw %%mm2, %%mm0\n\t" | |
927 "psubsw %%mm3, %%mm1\n\t" | |
928 "movq %%mm0, %0\n\t" | |
929 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
930 :"+m"(*p) |
0 | 931 :"m"(*pix) |
932 :"memory"); | |
933 pix += line_size; | |
934 p += 8; | |
935 } while (--h); | |
936 } | |
937 | |
938 static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
939 { | |
940 DCTELEM *p; | |
941 const UINT8 *pix; | |
942 p = block; | |
943 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
944 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
945 MOVQ_WONE(mm6); |
0 | 946 do { |
947 __asm __volatile( | |
948 "movq %0, %%mm0\n\t" | |
949 "movq %1, %%mm2\n\t" | |
950 "movq 8%0, %%mm1\n\t" | |
951 "movq %2, %%mm4\n\t" | |
952 "movq %%mm2, %%mm3\n\t" | |
953 "movq %%mm4, %%mm5\n\t" | |
954 "punpcklbw %%mm7, %%mm2\n\t" | |
955 "punpckhbw %%mm7, %%mm3\n\t" | |
956 "punpcklbw %%mm7, %%mm4\n\t" | |
957 "punpckhbw %%mm7, %%mm5\n\t" | |
958 "paddusw %%mm4, %%mm2\n\t" | |
959 "paddusw %%mm5, %%mm3\n\t" | |
960 "paddusw %%mm6, %%mm2\n\t" | |
961 "paddusw %%mm6, %%mm3\n\t" | |
962 "psrlw $1, %%mm2\n\t" | |
963 "psrlw $1, %%mm3\n\t" | |
964 "psubsw %%mm2, %%mm0\n\t" | |
965 "psubsw %%mm3, %%mm1\n\t" | |
966 "movq %%mm0, %0\n\t" | |
967 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
968 :"+m"(*p) |
0 | 969 :"m"(*pix), "m"(*(pix+line_size)) |
970 :"memory"); | |
971 pix += line_size; | |
972 p += 8; | |
973 } while (--h); | |
974 } | |
975 | |
976 static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
977 { | |
978 DCTELEM *p; | |
979 const UINT8 *pix; | |
980 p = block; | |
981 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
982 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
983 MOVQ_WTWO(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
984 JUMPALIGN(); |
0 | 985 do { |
986 __asm __volatile( | |
987 "movq %1, %%mm0\n\t" | |
988 "movq %2, %%mm1\n\t" | |
989 "movq 1%1, %%mm4\n\t" | |
990 "movq 1%2, %%mm5\n\t" | |
991 "movq %%mm0, %%mm2\n\t" | |
992 "movq %%mm1, %%mm3\n\t" | |
993 "punpcklbw %%mm7, %%mm0\n\t" | |
994 "punpcklbw %%mm7, %%mm1\n\t" | |
995 "punpckhbw %%mm7, %%mm2\n\t" | |
996 "punpckhbw %%mm7, %%mm3\n\t" | |
997 "paddusw %%mm1, %%mm0\n\t" | |
998 "paddusw %%mm3, %%mm2\n\t" | |
999 "movq %%mm4, %%mm1\n\t" | |
1000 "movq %%mm5, %%mm3\n\t" | |
1001 "punpcklbw %%mm7, %%mm4\n\t" | |
1002 "punpcklbw %%mm7, %%mm5\n\t" | |
1003 "punpckhbw %%mm7, %%mm1\n\t" | |
1004 "punpckhbw %%mm7, %%mm3\n\t" | |
1005 "paddusw %%mm5, %%mm4\n\t" | |
1006 "paddusw %%mm3, %%mm1\n\t" | |
1007 "paddusw %%mm6, %%mm4\n\t" | |
1008 "paddusw %%mm6, %%mm1\n\t" | |
1009 "paddusw %%mm4, %%mm0\n\t" | |
1010 "paddusw %%mm1, %%mm2\n\t" | |
1011 "movq %0, %%mm1\n\t" | |
1012 "movq 8%0, %%mm3\n\t" | |
1013 "psrlw $2, %%mm0\n\t" | |
1014 "psrlw $2, %%mm2\n\t" | |
1015 "psubsw %%mm0, %%mm1\n\t" | |
1016 "psubsw %%mm2, %%mm3\n\t" | |
1017 "movq %%mm1, %0\n\t" | |
1018 "movq %%mm3, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
1019 :"+m"(*p) |
0 | 1020 :"m"(*pix), |
1021 "m"(*(pix+line_size)) | |
1022 :"memory"); | |
1023 pix += line_size; | |
1024 p += 8 ; | |
1025 } while(--h); | |
1026 } | |
1027 | |
296 | 1028 static void clear_blocks_mmx(DCTELEM *blocks) |
1029 { | |
1030 asm volatile( | |
1031 "pxor %%mm7, %%mm7 \n\t" | |
1032 "movl $-128*6, %%eax \n\t" | |
1033 "1: \n\t" | |
1034 "movq %%mm7, (%0, %%eax) \n\t" | |
1035 "movq %%mm7, 8(%0, %%eax) \n\t" | |
1036 "movq %%mm7, 16(%0, %%eax) \n\t" | |
1037 "movq %%mm7, 24(%0, %%eax) \n\t" | |
1038 "addl $32, %%eax \n\t" | |
1039 " js 1b \n\t" | |
1040 : : "r" (((int)blocks)+128*6) | |
1041 : "%eax" | |
1042 ); | |
1043 } | |
1044 | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
/*
 * Do-nothing stub used by the (disabled) speed-test section of
 * dsputil_init_mmx().  The parameter list is deliberately left empty
 * rather than (void) so the stub can stand in for hooks that take
 * different argument lists.
 */
static void just_return()
{
}
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1046 |
0 | 1047 void dsputil_init_mmx(void) |
1048 { | |
1049 mm_flags = mm_support(); | |
188 | 1050 #if 1 |
1051 printf("libavcodec: CPU flags:"); | |
0 | 1052 if (mm_flags & MM_MMX) |
1053 printf(" mmx"); | |
1054 if (mm_flags & MM_MMXEXT) | |
1055 printf(" mmxext"); | |
1056 if (mm_flags & MM_3DNOW) | |
1057 printf(" 3dnow"); | |
1058 if (mm_flags & MM_SSE) | |
1059 printf(" sse"); | |
1060 if (mm_flags & MM_SSE2) | |
1061 printf(" sse2"); | |
1062 printf("\n"); | |
1063 #endif | |
1064 | |
1065 if (mm_flags & MM_MMX) { | |
1066 get_pixels = get_pixels_mmx; | |
1067 put_pixels_clamped = put_pixels_clamped_mmx; | |
1068 add_pixels_clamped = add_pixels_clamped_mmx; | |
296 | 1069 clear_blocks= clear_blocks_mmx; |
1070 | |
294 | 1071 pix_abs16x16 = pix_abs16x16_mmx; |
1072 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
1073 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
0 | 1074 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; |
294 | 1075 pix_abs8x8 = pix_abs8x8_mmx; |
1076 pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
1077 pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
1078 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
0 | 1079 av_fdct = fdct_mmx; |
1080 | |
1081 put_pixels_tab[0] = put_pixels_mmx; | |
1082 put_pixels_tab[1] = put_pixels_x2_mmx; | |
1083 put_pixels_tab[2] = put_pixels_y2_mmx; | |
1084 put_pixels_tab[3] = put_pixels_xy2_mmx; | |
1085 | |
1086 put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
1087 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1088 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1089 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
1090 | |
1091 avg_pixels_tab[0] = avg_pixels_mmx; | |
1092 avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
1093 avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
1094 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1095 | |
1096 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
1097 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
1098 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
1099 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
1100 | |
1101 sub_pixels_tab[0] = sub_pixels_mmx; | |
1102 sub_pixels_tab[1] = sub_pixels_x2_mmx; | |
1103 sub_pixels_tab[2] = sub_pixels_y2_mmx; | |
1104 sub_pixels_tab[3] = sub_pixels_xy2_mmx; | |
1105 | |
1106 if (mm_flags & MM_MMXEXT) { | |
294 | 1107 pix_abs16x16 = pix_abs16x16_mmx2; |
1108 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | |
1109 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | |
1110 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; | |
1111 | |
1112 pix_abs8x8 = pix_abs8x8_mmx2; | |
1113 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | |
1114 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | |
1115 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | |
1116 | |
0 | 1117 put_pixels_tab[1] = put_pixels_x2_sse; |
1118 put_pixels_tab[2] = put_pixels_y2_sse; | |
1119 | |
1120 avg_pixels_tab[0] = avg_pixels_sse; | |
1121 avg_pixels_tab[1] = avg_pixels_x2_sse; | |
1122 avg_pixels_tab[2] = avg_pixels_y2_sse; | |
1123 avg_pixels_tab[3] = avg_pixels_xy2_sse; | |
1124 | |
1125 sub_pixels_tab[1] = sub_pixels_x2_sse; | |
1126 sub_pixels_tab[2] = sub_pixels_y2_sse; | |
1127 } else if (mm_flags & MM_3DNOW) { | |
1128 put_pixels_tab[1] = put_pixels_x2_3dnow; | |
1129 put_pixels_tab[2] = put_pixels_y2_3dnow; | |
1130 | |
1131 avg_pixels_tab[0] = avg_pixels_3dnow; | |
1132 avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
1133 avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
1134 avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
1135 | |
1136 sub_pixels_tab[1] = sub_pixels_x2_3dnow; | |
1137 sub_pixels_tab[2] = sub_pixels_y2_3dnow; | |
1138 } | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1139 |
42 | 1140 /* idct */ |
1141 if (mm_flags & MM_MMXEXT) { | |
1142 ff_idct = ff_mmxext_idct; | |
1143 } else { | |
1144 ff_idct = ff_mmx_idct; | |
1145 } | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1146 #ifdef SIMPLE_IDCT |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1147 // ff_idct = simple_idct; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1148 ff_idct = simple_idct_mmx; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1149 #endif |
0 | 1150 } |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1151 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1152 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1153 // for speed testing |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1154 get_pixels = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1155 put_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1156 add_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1157 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1158 pix_abs16x16 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1159 pix_abs16x16_x2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1160 pix_abs16x16_y2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1161 pix_abs16x16_xy2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1162 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1163 put_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1164 put_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1165 put_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1166 put_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1167 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1168 put_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1169 put_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1170 put_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1171 put_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1172 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1173 avg_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1174 avg_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1175 avg_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1176 avg_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1177 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1178 avg_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1179 avg_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1180 avg_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1181 avg_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1182 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1183 sub_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1184 sub_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1185 sub_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1186 sub_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1187 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1188 //av_fdct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1189 //ff_idct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1190 #endif |
0 | 1191 } |