Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 256:4c1cec7c3c7c libavcodec
q-pel mc fixed
author | michaelni |
---|---|
date | Sat, 09 Mar 2002 14:22:21 +0000 |
parents | 6f48cacd9ed9 |
children | 944632089814 |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
3 * Copyright (c) 2000, 2001 Gerard Lantau. | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 * | |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 #include "../dsputil.h" | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
23 #include "../simple_idct.h" |
0 | 24 |
5 | 25 int mm_flags; /* multimedia extension flags */ |
26 | |
0 | 27 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); |
28 int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
29 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
30 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
31 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |
32 | |
42 | 33 /* external functions, from idct_mmx.c */ |
34 void ff_mmx_idct(DCTELEM *block); | |
35 void ff_mmxext_idct(DCTELEM *block); | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
36 |
0 | 37 /* pixel operations */ |
/*
 * 64-bit constants holding four identical 16-bit lanes: 0x0001 in every lane
 * (mm_wone) and 0x0002 in every lane (mm_wtwo).
 * The ULL suffix states the intended 64-bit type explicitly; without it the
 * literal does not fit in "long" on i386.
 */
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
0 | 42 |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
/* Emit a ".balign 8" directive at the current position in the code. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)

/* Clear MMX register regd by XORing it with itself. */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/*
 * MOVQ_WONE(regd) / MOVQ_WTWO(regd): fill every 16-bit lane of MMX register
 * regd with 1 resp. 2.
 */
#ifdef PIC
/*
 * For a shared library the constants are built in the register instead of
 * being loaded from memory: pcmpeqd of a register with itself sets all bits
 * (-1), the logical right shift by 15 leaves 1 in each word, and the extra
 * left shift by 1 turns that into 2.
 */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd ::)
#else
/* Non-PIC build: load the constants from mm_wone / mm_wtwo. */
#define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#endif
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
63 |
0 | 64 /***********************************/ |
65 /* 3Dnow specific */ | |
66 | |
67 #define DEF(x) x ## _3dnow | |
68 /* for Athlons PAVGUSB is preferred */ | |
69 #define PAVGB "pavgusb" | |
70 | |
71 #include "dsputil_mmx_avg.h" | |
72 | |
73 #undef DEF | |
74 #undef PAVGB | |
75 | |
76 /***********************************/ | |
77 /* MMX2 specific */ | |
78 | |
79 #define DEF(x) x ## _sse | |
80 | |
81 /* Introduced only in MMX2 set */ | |
82 #define PAVGB "pavgb" | |
83 | |
84 #include "dsputil_mmx_avg.h" | |
85 | |
86 #undef DEF | |
87 #undef PAVGB | |
88 | |
89 /***********************************/ | |
90 /* standard MMX */ | |
91 | |
92 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
93 { | |
94 DCTELEM *p; | |
95 const UINT8 *pix; | |
96 int i; | |
97 | |
98 /* read the pixels */ | |
99 p = block; | |
100 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
101 MOVQ_ZERO(mm7); |
0 | 102 for(i=0;i<4;i++) { |
103 __asm __volatile( | |
104 "movq %1, %%mm0\n\t" | |
105 "movq %2, %%mm1\n\t" | |
106 "movq %%mm0, %%mm2\n\t" | |
107 "movq %%mm1, %%mm3\n\t" | |
108 "punpcklbw %%mm7, %%mm0\n\t" | |
109 "punpckhbw %%mm7, %%mm2\n\t" | |
110 "punpcklbw %%mm7, %%mm1\n\t" | |
111 "punpckhbw %%mm7, %%mm3\n\t" | |
112 "movq %%mm0, %0\n\t" | |
113 "movq %%mm2, 8%0\n\t" | |
114 "movq %%mm1, 16%0\n\t" | |
115 "movq %%mm3, 24%0\n\t" | |
116 :"=m"(*p) | |
117 :"m"(*pix), "m"(*(pix+line_size)) | |
118 :"memory"); | |
119 pix += line_size*2; | |
120 p += 16; | |
121 } | |
122 } | |
123 | |
124 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
125 { | |
126 const DCTELEM *p; | |
127 UINT8 *pix; | |
128 | |
129 /* read the pixels */ | |
130 p = block; | |
131 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
132 /* unrolled loop */ |
0 | 133 __asm __volatile( |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
134 "movq %3, %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
135 "movq 8%3, %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
136 "movq 16%3, %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
137 "movq 24%3, %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
138 "movq 32%3, %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
139 "movq 40%3, %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
140 "movq 48%3, %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
141 "movq 56%3, %%mm7\n\t" |
0 | 142 "packuswb %%mm1, %%mm0\n\t" |
143 "packuswb %%mm3, %%mm2\n\t" | |
144 "packuswb %%mm5, %%mm4\n\t" | |
145 "packuswb %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
146 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
147 "movq %%mm2, (%0, %1)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
148 "movq %%mm4, (%0, %1, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
149 "movq %%mm6, (%0, %2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
150 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) |
0 | 151 :"memory"); |
152 pix += line_size*4; | |
153 p += 32; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
154 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
155 // if here would be an exact copy of the code above |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
156 // compiler would generate some very strange code |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
157 // thus using "r" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
158 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
159 "movq (%3), %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
160 "movq 8(%3), %%mm1\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
161 "movq 16(%3), %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
162 "movq 24(%3), %%mm3\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
163 "movq 32(%3), %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
164 "movq 40(%3), %%mm5\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
165 "movq 48(%3), %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
166 "movq 56(%3), %%mm7\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
167 "packuswb %%mm1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
168 "packuswb %%mm3, %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
169 "packuswb %%mm5, %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
170 "packuswb %%mm7, %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
171 "movq %%mm0, (%0)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
172 "movq %%mm2, (%0, %1)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
173 "movq %%mm4, (%0, %1, 2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
174 "movq %%mm6, (%0, %2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
175 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
176 :"memory"); |
0 | 177 } |
178 | |
179 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
180 { | |
181 const DCTELEM *p; | |
182 UINT8 *pix; | |
183 int i; | |
184 | |
185 /* read the pixels */ | |
186 p = block; | |
187 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
188 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
189 i = 4; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
190 while (i) { |
0 | 191 __asm __volatile( |
192 "movq %2, %%mm0\n\t" | |
193 "movq 8%2, %%mm1\n\t" | |
194 "movq 16%2, %%mm2\n\t" | |
195 "movq 24%2, %%mm3\n\t" | |
196 "movq %0, %%mm4\n\t" | |
197 "movq %1, %%mm6\n\t" | |
198 "movq %%mm4, %%mm5\n\t" | |
199 "punpcklbw %%mm7, %%mm4\n\t" | |
200 "punpckhbw %%mm7, %%mm5\n\t" | |
201 "paddsw %%mm4, %%mm0\n\t" | |
202 "paddsw %%mm5, %%mm1\n\t" | |
203 "movq %%mm6, %%mm5\n\t" | |
204 "punpcklbw %%mm7, %%mm6\n\t" | |
205 "punpckhbw %%mm7, %%mm5\n\t" | |
206 "paddsw %%mm6, %%mm2\n\t" | |
207 "paddsw %%mm5, %%mm3\n\t" | |
208 "packuswb %%mm1, %%mm0\n\t" | |
209 "packuswb %%mm3, %%mm2\n\t" | |
210 "movq %%mm0, %0\n\t" | |
211 "movq %%mm2, %1\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
212 :"+m"(*pix), "+m"(*(pix+line_size)) |
0 | 213 :"m"(*p) |
214 :"memory"); | |
215 pix += line_size*2; | |
216 p += 16; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
217 i--; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
218 }; |
0 | 219 } |
220 | |
221 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
222 { | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
223 int hh; |
0 | 224 UINT8 *p; |
225 const UINT8 *pix; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
226 |
0 | 227 p = block; |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
228 pix = pixels; // 2s |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
229 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
230 do { |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
231 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
232 "movq %1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
233 "movq %%mm0, %0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
234 :"=m"(*p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
235 :"m"(*pix) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
236 :"memory"); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
237 pix += line_size; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
238 p += line_size; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
239 } while (--h); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
240 #else |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
241 // this optimized code is not very usefull |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
242 // the above loop is definitely faster |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
243 // at least on Celeron 500MHz |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
244 hh = h & 3; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
245 while (hh) { |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
246 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
247 "movq %1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
248 "movq %%mm0, %0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
249 :"=m"(*p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
250 :"m"(*pix) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
251 :"memory"); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
252 pix += line_size; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
253 p += line_size; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
254 hh--; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
255 } |
0 | 256 hh=h>>2; |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
257 while (hh) { |
0 | 258 __asm __volatile( |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
259 "movq (%1), %%mm0 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
260 "movq (%1, %2), %%mm1 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
261 "movq (%1, %2, 2), %%mm2 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
262 "movq (%1, %3), %%mm3 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
263 "movq %%mm0, (%0) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
264 "movq %%mm1, (%0, %2) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
265 "movq %%mm2, (%0, %2, 2) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
266 "movq %%mm3, (%0, %3) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
267 ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3) |
0 | 268 :"memory"); |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
269 pix += line_size*4; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
270 p += line_size*4; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
271 hh--; |
0 | 272 } |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
273 #endif |
0 | 274 } |
275 | |
276 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
277 { | |
278 UINT8 *p; | |
279 const UINT8 *pix; | |
280 p = block; | |
281 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
282 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
283 MOVQ_WONE(mm4); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
284 JUMPALIGN(); |
0 | 285 do { |
286 __asm __volatile( | |
287 "movq %1, %%mm0\n\t" | |
288 "movq 1%1, %%mm1\n\t" | |
289 "movq %%mm0, %%mm2\n\t" | |
290 "movq %%mm1, %%mm3\n\t" | |
291 "punpcklbw %%mm7, %%mm0\n\t" | |
292 "punpcklbw %%mm7, %%mm1\n\t" | |
293 "punpckhbw %%mm7, %%mm2\n\t" | |
294 "punpckhbw %%mm7, %%mm3\n\t" | |
295 "paddusw %%mm1, %%mm0\n\t" | |
296 "paddusw %%mm3, %%mm2\n\t" | |
297 "paddusw %%mm4, %%mm0\n\t" | |
298 "paddusw %%mm4, %%mm2\n\t" | |
299 "psrlw $1, %%mm0\n\t" | |
300 "psrlw $1, %%mm2\n\t" | |
301 "packuswb %%mm2, %%mm0\n\t" | |
302 "movq %%mm0, %0\n\t" | |
303 :"=m"(*p) | |
304 :"m"(*pix) | |
305 :"memory"); | |
306 pix += line_size; p += line_size; | |
307 } while (--h); | |
308 } | |
309 | |
310 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
311 { | |
312 UINT8 *p; | |
313 const UINT8 *pix; | |
314 p = block; | |
315 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
316 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
317 MOVQ_WONE(mm4); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
318 JUMPALIGN(); |
0 | 319 do { |
320 __asm __volatile( | |
321 "movq %1, %%mm0\n\t" | |
322 "movq %2, %%mm1\n\t" | |
323 "movq %%mm0, %%mm2\n\t" | |
324 "movq %%mm1, %%mm3\n\t" | |
325 "punpcklbw %%mm7, %%mm0\n\t" | |
326 "punpcklbw %%mm7, %%mm1\n\t" | |
327 "punpckhbw %%mm7, %%mm2\n\t" | |
328 "punpckhbw %%mm7, %%mm3\n\t" | |
329 "paddusw %%mm1, %%mm0\n\t" | |
330 "paddusw %%mm3, %%mm2\n\t" | |
331 "paddusw %%mm4, %%mm0\n\t" | |
332 "paddusw %%mm4, %%mm2\n\t" | |
333 "psrlw $1, %%mm0\n\t" | |
334 "psrlw $1, %%mm2\n\t" | |
335 "packuswb %%mm2, %%mm0\n\t" | |
336 "movq %%mm0, %0\n\t" | |
337 :"=m"(*p) | |
338 :"m"(*pix), | |
339 "m"(*(pix+line_size)) | |
340 :"memory"); | |
341 pix += line_size; | |
342 p += line_size; | |
343 } while (--h); | |
344 } | |
345 | |
346 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
347 { | |
348 UINT8 *p; | |
349 const UINT8 *pix; | |
350 p = block; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
351 pix = pixels; // 1s |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
352 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
353 MOVQ_WTWO(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
354 JUMPALIGN(); |
0 | 355 do { |
356 __asm __volatile( | |
357 "movq %1, %%mm0\n\t" | |
358 "movq %2, %%mm1\n\t" | |
359 "movq 1%1, %%mm4\n\t" | |
360 "movq 1%2, %%mm5\n\t" | |
361 "movq %%mm0, %%mm2\n\t" | |
362 "movq %%mm1, %%mm3\n\t" | |
363 "punpcklbw %%mm7, %%mm0\n\t" | |
364 "punpcklbw %%mm7, %%mm1\n\t" | |
365 "punpckhbw %%mm7, %%mm2\n\t" | |
366 "punpckhbw %%mm7, %%mm3\n\t" | |
367 "paddusw %%mm1, %%mm0\n\t" | |
368 "paddusw %%mm3, %%mm2\n\t" | |
369 "movq %%mm4, %%mm1\n\t" | |
370 "movq %%mm5, %%mm3\n\t" | |
371 "punpcklbw %%mm7, %%mm4\n\t" | |
372 "punpcklbw %%mm7, %%mm5\n\t" | |
373 "punpckhbw %%mm7, %%mm1\n\t" | |
374 "punpckhbw %%mm7, %%mm3\n\t" | |
375 "paddusw %%mm5, %%mm4\n\t" | |
376 "paddusw %%mm3, %%mm1\n\t" | |
377 "paddusw %%mm6, %%mm4\n\t" | |
378 "paddusw %%mm6, %%mm1\n\t" | |
379 "paddusw %%mm4, %%mm0\n\t" | |
380 "paddusw %%mm1, %%mm2\n\t" | |
381 "psrlw $2, %%mm0\n\t" | |
382 "psrlw $2, %%mm2\n\t" | |
383 "packuswb %%mm2, %%mm0\n\t" | |
384 "movq %%mm0, %0\n\t" | |
385 :"=m"(*p) | |
386 :"m"(*pix), | |
387 "m"(*(pix+line_size)) | |
388 :"memory"); | |
389 pix += line_size; | |
390 p += line_size; | |
391 } while(--h); | |
392 } | |
393 | |
394 static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
395 { | |
396 UINT8 *p; | |
397 const UINT8 *pix; | |
398 p = block; | |
399 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
400 MOVQ_ZERO(mm7); |
0 | 401 do { |
402 __asm __volatile( | |
403 "movq %1, %%mm0\n\t" | |
404 "movq 1%1, %%mm1\n\t" | |
405 "movq %%mm0, %%mm2\n\t" | |
406 "movq %%mm1, %%mm3\n\t" | |
407 "punpcklbw %%mm7, %%mm0\n\t" | |
408 "punpcklbw %%mm7, %%mm1\n\t" | |
409 "punpckhbw %%mm7, %%mm2\n\t" | |
410 "punpckhbw %%mm7, %%mm3\n\t" | |
411 "paddusw %%mm1, %%mm0\n\t" | |
412 "paddusw %%mm3, %%mm2\n\t" | |
413 "psrlw $1, %%mm0\n\t" | |
414 "psrlw $1, %%mm2\n\t" | |
415 "packuswb %%mm2, %%mm0\n\t" | |
416 "movq %%mm0, %0\n\t" | |
417 :"=m"(*p) | |
418 :"m"(*pix) | |
419 :"memory"); | |
420 pix += line_size; | |
421 p += line_size; | |
422 } while (--h); | |
423 } | |
424 | |
425 static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
426 { | |
427 UINT8 *p; | |
428 const UINT8 *pix; | |
429 p = block; | |
430 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
431 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
432 JUMPALIGN(); |
0 | 433 do { |
434 __asm __volatile( | |
435 "movq %1, %%mm0\n\t" | |
436 "movq %2, %%mm1\n\t" | |
437 "movq %%mm0, %%mm2\n\t" | |
438 "movq %%mm1, %%mm3\n\t" | |
439 "punpcklbw %%mm7, %%mm0\n\t" | |
440 "punpcklbw %%mm7, %%mm1\n\t" | |
441 "punpckhbw %%mm7, %%mm2\n\t" | |
442 "punpckhbw %%mm7, %%mm3\n\t" | |
443 "paddusw %%mm1, %%mm0\n\t" | |
444 "paddusw %%mm3, %%mm2\n\t" | |
445 "psrlw $1, %%mm0\n\t" | |
446 "psrlw $1, %%mm2\n\t" | |
447 "packuswb %%mm2, %%mm0\n\t" | |
448 "movq %%mm0, %0\n\t" | |
449 :"=m"(*p) | |
450 :"m"(*pix), | |
451 "m"(*(pix+line_size)) | |
452 :"memory"); | |
453 pix += line_size; | |
454 p += line_size; | |
455 } while(--h); | |
456 } | |
457 | |
458 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
459 { | |
460 UINT8 *p; | |
461 const UINT8 *pix; | |
462 p = block; | |
463 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
464 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
465 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
466 JUMPALIGN(); |
0 | 467 do { |
468 __asm __volatile( | |
469 "movq %1, %%mm0\n\t" | |
470 "movq %2, %%mm1\n\t" | |
471 "movq 1%1, %%mm4\n\t" | |
472 "movq 1%2, %%mm5\n\t" | |
473 "movq %%mm0, %%mm2\n\t" | |
474 "movq %%mm1, %%mm3\n\t" | |
475 "punpcklbw %%mm7, %%mm0\n\t" | |
476 "punpcklbw %%mm7, %%mm1\n\t" | |
477 "punpckhbw %%mm7, %%mm2\n\t" | |
478 "punpckhbw %%mm7, %%mm3\n\t" | |
479 "paddusw %%mm1, %%mm0\n\t" | |
480 "paddusw %%mm3, %%mm2\n\t" | |
481 "movq %%mm4, %%mm1\n\t" | |
482 "movq %%mm5, %%mm3\n\t" | |
483 "punpcklbw %%mm7, %%mm4\n\t" | |
484 "punpcklbw %%mm7, %%mm5\n\t" | |
485 "punpckhbw %%mm7, %%mm1\n\t" | |
486 "punpckhbw %%mm7, %%mm3\n\t" | |
487 "paddusw %%mm5, %%mm4\n\t" | |
488 "paddusw %%mm3, %%mm1\n\t" | |
489 "paddusw %%mm6, %%mm4\n\t" | |
490 "paddusw %%mm6, %%mm1\n\t" | |
491 "paddusw %%mm4, %%mm0\n\t" | |
492 "paddusw %%mm1, %%mm2\n\t" | |
493 "psrlw $2, %%mm0\n\t" | |
494 "psrlw $2, %%mm2\n\t" | |
495 "packuswb %%mm2, %%mm0\n\t" | |
496 "movq %%mm0, %0\n\t" | |
497 :"=m"(*p) | |
498 :"m"(*pix), | |
499 "m"(*(pix+line_size)) | |
500 :"memory"); | |
501 pix += line_size; | |
502 p += line_size; | |
503 } while(--h); | |
504 } | |
505 | |
506 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
507 { | |
508 UINT8 *p; | |
509 const UINT8 *pix; | |
510 p = block; | |
511 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
512 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
513 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
514 JUMPALIGN(); |
0 | 515 do { |
516 __asm __volatile( | |
517 "movq %0, %%mm0\n\t" | |
518 "movq %1, %%mm1\n\t" | |
519 "movq %%mm0, %%mm2\n\t" | |
520 "movq %%mm1, %%mm3\n\t" | |
521 "punpcklbw %%mm7, %%mm0\n\t" | |
522 "punpcklbw %%mm7, %%mm1\n\t" | |
523 "punpckhbw %%mm7, %%mm2\n\t" | |
524 "punpckhbw %%mm7, %%mm3\n\t" | |
525 "paddusw %%mm1, %%mm0\n\t" | |
526 "paddusw %%mm3, %%mm2\n\t" | |
527 "paddusw %%mm6, %%mm0\n\t" | |
528 "paddusw %%mm6, %%mm2\n\t" | |
529 "psrlw $1, %%mm0\n\t" | |
530 "psrlw $1, %%mm2\n\t" | |
531 "packuswb %%mm2, %%mm0\n\t" | |
532 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
533 :"+m"(*p) |
0 | 534 :"m"(*pix) |
535 :"memory"); | |
536 pix += line_size; | |
537 p += line_size; | |
538 } | |
539 while (--h); | |
540 } | |
541 | |
542 static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
543 { | |
544 UINT8 *p; | |
545 const UINT8 *pix; | |
546 p = block; | |
547 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
548 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
549 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
550 JUMPALIGN(); |
0 | 551 do { |
552 __asm __volatile( | |
553 "movq %1, %%mm1\n\t" | |
554 "movq %0, %%mm0\n\t" | |
555 "movq 1%1, %%mm4\n\t" | |
556 "movq %%mm0, %%mm2\n\t" | |
557 "movq %%mm1, %%mm3\n\t" | |
558 "movq %%mm4, %%mm5\n\t" | |
559 "punpcklbw %%mm7, %%mm1\n\t" | |
560 "punpckhbw %%mm7, %%mm3\n\t" | |
561 "punpcklbw %%mm7, %%mm4\n\t" | |
562 "punpckhbw %%mm7, %%mm5\n\t" | |
563 "punpcklbw %%mm7, %%mm0\n\t" | |
564 "punpckhbw %%mm7, %%mm2\n\t" | |
565 "paddusw %%mm4, %%mm1\n\t" | |
566 "paddusw %%mm5, %%mm3\n\t" | |
567 "paddusw %%mm6, %%mm1\n\t" | |
568 "paddusw %%mm6, %%mm3\n\t" | |
569 "psrlw $1, %%mm1\n\t" | |
570 "psrlw $1, %%mm3\n\t" | |
571 "paddusw %%mm6, %%mm0\n\t" | |
572 "paddusw %%mm6, %%mm2\n\t" | |
573 "paddusw %%mm1, %%mm0\n\t" | |
574 "paddusw %%mm3, %%mm2\n\t" | |
575 "psrlw $1, %%mm0\n\t" | |
576 "psrlw $1, %%mm2\n\t" | |
577 "packuswb %%mm2, %%mm0\n\t" | |
578 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
579 :"+m"(*p) |
0 | 580 :"m"(*pix) |
581 :"memory"); | |
582 pix += line_size; | |
583 p += line_size; | |
584 } while (--h); | |
585 } | |
586 | |
587 static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
588 { | |
589 UINT8 *p; | |
590 const UINT8 *pix; | |
591 p = block; | |
592 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
593 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
594 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
595 JUMPALIGN(); |
0 | 596 do { |
597 __asm __volatile( | |
598 "movq %1, %%mm1\n\t" | |
599 "movq %0, %%mm0\n\t" | |
600 "movq %2, %%mm4\n\t" | |
601 "movq %%mm0, %%mm2\n\t" | |
602 "movq %%mm1, %%mm3\n\t" | |
603 "movq %%mm4, %%mm5\n\t" | |
604 "punpcklbw %%mm7, %%mm1\n\t" | |
605 "punpckhbw %%mm7, %%mm3\n\t" | |
606 "punpcklbw %%mm7, %%mm4\n\t" | |
607 "punpckhbw %%mm7, %%mm5\n\t" | |
608 "punpcklbw %%mm7, %%mm0\n\t" | |
609 "punpckhbw %%mm7, %%mm2\n\t" | |
610 "paddusw %%mm4, %%mm1\n\t" | |
611 "paddusw %%mm5, %%mm3\n\t" | |
612 "paddusw %%mm6, %%mm1\n\t" | |
613 "paddusw %%mm6, %%mm3\n\t" | |
614 "psrlw $1, %%mm1\n\t" | |
615 "psrlw $1, %%mm3\n\t" | |
616 "paddusw %%mm6, %%mm0\n\t" | |
617 "paddusw %%mm6, %%mm2\n\t" | |
618 "paddusw %%mm1, %%mm0\n\t" | |
619 "paddusw %%mm3, %%mm2\n\t" | |
620 "psrlw $1, %%mm0\n\t" | |
621 "psrlw $1, %%mm2\n\t" | |
622 "packuswb %%mm2, %%mm0\n\t" | |
623 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
624 :"+m"(*p) |
0 | 625 :"m"(*pix), "m"(*(pix+line_size)) |
626 :"memory"); | |
627 pix += line_size; | |
628 p += line_size ; | |
629 } while(--h); | |
630 } | |
631 | |
632 static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
633 { | |
634 UINT8 *p; | |
635 const UINT8 *pix; | |
636 p = block; | |
637 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
638 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
639 // this doesn't seem to be used offten - so |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
640 // the inside usage of mm_wone is not optimized |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
641 MOVQ_WTWO(mm6); |
0 | 642 do { |
643 __asm __volatile( | |
644 "movq %1, %%mm0\n\t" | |
645 "movq %2, %%mm1\n\t" | |
646 "movq 1%1, %%mm4\n\t" | |
647 "movq 1%2, %%mm5\n\t" | |
648 "movq %%mm0, %%mm2\n\t" | |
649 "movq %%mm1, %%mm3\n\t" | |
650 "punpcklbw %%mm7, %%mm0\n\t" | |
651 "punpcklbw %%mm7, %%mm1\n\t" | |
652 "punpckhbw %%mm7, %%mm2\n\t" | |
653 "punpckhbw %%mm7, %%mm3\n\t" | |
654 "paddusw %%mm1, %%mm0\n\t" | |
655 "paddusw %%mm3, %%mm2\n\t" | |
656 "movq %%mm4, %%mm1\n\t" | |
657 "movq %%mm5, %%mm3\n\t" | |
658 "punpcklbw %%mm7, %%mm4\n\t" | |
659 "punpcklbw %%mm7, %%mm5\n\t" | |
660 "punpckhbw %%mm7, %%mm1\n\t" | |
661 "punpckhbw %%mm7, %%mm3\n\t" | |
662 "paddusw %%mm5, %%mm4\n\t" | |
663 "paddusw %%mm3, %%mm1\n\t" | |
664 "paddusw %%mm6, %%mm4\n\t" | |
665 "paddusw %%mm6, %%mm1\n\t" | |
666 "paddusw %%mm4, %%mm0\n\t" | |
667 "paddusw %%mm1, %%mm2\n\t" | |
668 "movq %3, %%mm5\n\t" | |
669 "psrlw $2, %%mm0\n\t" | |
670 "movq %0, %%mm1\n\t" | |
671 "psrlw $2, %%mm2\n\t" | |
672 "movq %%mm1, %%mm3\n\t" | |
673 "punpcklbw %%mm7, %%mm1\n\t" | |
674 "punpckhbw %%mm7, %%mm3\n\t" | |
675 "paddusw %%mm1, %%mm0\n\t" | |
676 "paddusw %%mm3, %%mm2\n\t" | |
677 "paddusw %%mm5, %%mm0\n\t" | |
678 "paddusw %%mm5, %%mm2\n\t" | |
679 "psrlw $1, %%mm0\n\t" | |
680 "psrlw $1, %%mm2\n\t" | |
681 "packuswb %%mm2, %%mm0\n\t" | |
682 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
683 :"+m"(*p) |
0 | 684 :"m"(*pix), |
8 | 685 "m"(*(pix+line_size)), "m"(mm_wone) |
0 | 686 :"memory"); |
687 pix += line_size; | |
688 p += line_size ; | |
689 } while(--h); | |
690 } | |
691 | |
692 static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
693 { | |
694 UINT8 *p; | |
695 const UINT8 *pix; | |
696 p = block; | |
697 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
698 MOVQ_ZERO(mm7); |
0 | 699 do { |
700 __asm __volatile( | |
701 "movq %1, %%mm0\n\t" | |
702 "movq %0, %%mm1\n\t" | |
703 "movq %%mm0, %%mm2\n\t" | |
704 "movq %%mm1, %%mm3\n\t" | |
705 "punpcklbw %%mm7, %%mm0\n\t" | |
706 "punpcklbw %%mm7, %%mm1\n\t" | |
707 "punpckhbw %%mm7, %%mm2\n\t" | |
708 "punpckhbw %%mm7, %%mm3\n\t" | |
709 "paddusw %%mm1, %%mm0\n\t" | |
710 "paddusw %%mm3, %%mm2\n\t" | |
711 "psrlw $1, %%mm0\n\t" | |
712 "psrlw $1, %%mm2\n\t" | |
713 "packuswb %%mm2, %%mm0\n\t" | |
714 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
715 :"+m"(*p) |
0 | 716 :"m"(*pix) |
717 :"memory"); | |
718 pix += line_size; | |
719 p += line_size ; | |
720 } while (--h); | |
721 } | |
722 | |
723 static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
724 { | |
725 UINT8 *p; | |
726 const UINT8 *pix; | |
727 p = block; | |
728 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
729 MOVQ_ZERO(mm7); |
0 | 730 do { |
731 __asm __volatile( | |
732 "movq %1, %%mm0\n\t" | |
733 "movq 1%1, %%mm1\n\t" | |
734 "movq %0, %%mm4\n\t" | |
735 "movq %%mm0, %%mm2\n\t" | |
736 "movq %%mm1, %%mm3\n\t" | |
737 "movq %%mm4, %%mm5\n\t" | |
738 "punpcklbw %%mm7, %%mm0\n\t" | |
739 "punpcklbw %%mm7, %%mm1\n\t" | |
740 "punpckhbw %%mm7, %%mm2\n\t" | |
741 "punpckhbw %%mm7, %%mm3\n\t" | |
742 "punpcklbw %%mm7, %%mm4\n\t" | |
743 "punpckhbw %%mm7, %%mm5\n\t" | |
744 "paddusw %%mm1, %%mm0\n\t" | |
745 "paddusw %%mm3, %%mm2\n\t" | |
746 "psrlw $1, %%mm0\n\t" | |
747 "psrlw $1, %%mm2\n\t" | |
748 "paddusw %%mm4, %%mm0\n\t" | |
749 "paddusw %%mm5, %%mm2\n\t" | |
750 "psrlw $1, %%mm0\n\t" | |
751 "psrlw $1, %%mm2\n\t" | |
752 "packuswb %%mm2, %%mm0\n\t" | |
753 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
754 :"+m"(*p) |
0 | 755 :"m"(*pix) |
756 :"memory"); | |
757 pix += line_size; | |
758 p += line_size; | |
759 } while (--h); | |
760 } | |
761 | |
762 static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
763 { | |
764 UINT8 *p; | |
765 const UINT8 *pix; | |
766 p = block; | |
767 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
768 MOVQ_ZERO(mm7); |
0 | 769 do { |
770 __asm __volatile( | |
771 "movq %1, %%mm0\n\t" | |
772 "movq %2, %%mm1\n\t" | |
773 "movq %0, %%mm4\n\t" | |
774 "movq %%mm0, %%mm2\n\t" | |
775 "movq %%mm1, %%mm3\n\t" | |
776 "movq %%mm4, %%mm5\n\t" | |
777 "punpcklbw %%mm7, %%mm0\n\t" | |
778 "punpcklbw %%mm7, %%mm1\n\t" | |
779 "punpckhbw %%mm7, %%mm2\n\t" | |
780 "punpckhbw %%mm7, %%mm3\n\t" | |
781 "punpcklbw %%mm7, %%mm4\n\t" | |
782 "punpckhbw %%mm7, %%mm5\n\t" | |
783 "paddusw %%mm1, %%mm0\n\t" | |
784 "paddusw %%mm3, %%mm2\n\t" | |
785 "psrlw $1, %%mm0\n\t" | |
786 "psrlw $1, %%mm2\n\t" | |
787 "paddusw %%mm4, %%mm0\n\t" | |
788 "paddusw %%mm5, %%mm2\n\t" | |
789 "psrlw $1, %%mm0\n\t" | |
790 "psrlw $1, %%mm2\n\t" | |
791 "packuswb %%mm2, %%mm0\n\t" | |
792 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
793 :"+m"(*p) |
0 | 794 :"m"(*pix), "m"(*(pix+line_size)) |
795 :"memory"); | |
796 pix += line_size; | |
797 p += line_size ; | |
798 } while(--h); | |
799 } | |
800 | |
801 static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
802 { | |
803 UINT8 *p; | |
804 const UINT8 *pix; | |
805 p = block; | |
806 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
807 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
808 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
809 JUMPALIGN(); |
0 | 810 do { |
811 __asm __volatile( | |
812 "movq %1, %%mm0\n\t" | |
813 "movq %2, %%mm1\n\t" | |
814 "movq 1%1, %%mm4\n\t" | |
815 "movq 1%2, %%mm5\n\t" | |
816 "movq %%mm0, %%mm2\n\t" | |
817 "movq %%mm1, %%mm3\n\t" | |
818 "punpcklbw %%mm7, %%mm0\n\t" | |
819 "punpcklbw %%mm7, %%mm1\n\t" | |
820 "punpckhbw %%mm7, %%mm2\n\t" | |
821 "punpckhbw %%mm7, %%mm3\n\t" | |
822 "paddusw %%mm1, %%mm0\n\t" | |
823 "paddusw %%mm3, %%mm2\n\t" | |
824 "movq %%mm4, %%mm1\n\t" | |
825 "movq %%mm5, %%mm3\n\t" | |
826 "punpcklbw %%mm7, %%mm4\n\t" | |
827 "punpcklbw %%mm7, %%mm5\n\t" | |
828 "punpckhbw %%mm7, %%mm1\n\t" | |
829 "punpckhbw %%mm7, %%mm3\n\t" | |
830 "paddusw %%mm5, %%mm4\n\t" | |
831 "paddusw %%mm3, %%mm1\n\t" | |
832 "paddusw %%mm6, %%mm4\n\t" | |
833 "paddusw %%mm6, %%mm1\n\t" | |
834 "paddusw %%mm4, %%mm0\n\t" | |
835 "paddusw %%mm1, %%mm2\n\t" | |
836 "movq %0, %%mm1\n\t" | |
837 "psrlw $2, %%mm0\n\t" | |
838 "movq %%mm1, %%mm3\n\t" | |
839 "psrlw $2, %%mm2\n\t" | |
840 "punpcklbw %%mm7, %%mm1\n\t" | |
841 "punpckhbw %%mm7, %%mm3\n\t" | |
842 "paddusw %%mm1, %%mm0\n\t" | |
843 "paddusw %%mm3, %%mm2\n\t" | |
844 "psrlw $1, %%mm0\n\t" | |
845 "psrlw $1, %%mm2\n\t" | |
846 "packuswb %%mm2, %%mm0\n\t" | |
847 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
848 :"+m"(*p) |
0 | 849 :"m"(*pix), |
850 "m"(*(pix+line_size)) | |
851 :"memory"); | |
852 pix += line_size; | |
853 p += line_size; | |
854 } while(--h); | |
855 } | |
856 | |
857 static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
858 { | |
859 DCTELEM *p; | |
860 const UINT8 *pix; | |
861 p = block; | |
862 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
863 MOVQ_ZERO(mm7); |
0 | 864 do { |
865 __asm __volatile( | |
866 "movq %0, %%mm0\n\t" | |
867 "movq %1, %%mm2\n\t" | |
868 "movq 8%0, %%mm1\n\t" | |
869 "movq %%mm2, %%mm3\n\t" | |
870 "punpcklbw %%mm7, %%mm2\n\t" | |
871 "punpckhbw %%mm7, %%mm3\n\t" | |
872 "psubsw %%mm2, %%mm0\n\t" | |
873 "psubsw %%mm3, %%mm1\n\t" | |
874 "movq %%mm0, %0\n\t" | |
875 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
876 :"+m"(*p) |
0 | 877 :"m"(*pix) |
878 :"memory"); | |
879 pix += line_size; | |
880 p += 8; | |
881 } while (--h); | |
882 } | |
883 | |
884 static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
885 { | |
886 DCTELEM *p; | |
887 const UINT8 *pix; | |
888 p = block; | |
889 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
890 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
891 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
892 JUMPALIGN(); |
0 | 893 do { |
894 __asm __volatile( | |
895 "movq %0, %%mm0\n\t" | |
896 "movq %1, %%mm2\n\t" | |
897 "movq 8%0, %%mm1\n\t" | |
898 "movq 1%1, %%mm4\n\t" | |
899 "movq %%mm2, %%mm3\n\t" | |
900 "movq %%mm4, %%mm5\n\t" | |
901 "punpcklbw %%mm7, %%mm2\n\t" | |
902 "punpckhbw %%mm7, %%mm3\n\t" | |
903 "punpcklbw %%mm7, %%mm4\n\t" | |
904 "punpckhbw %%mm7, %%mm5\n\t" | |
905 "paddusw %%mm4, %%mm2\n\t" | |
906 "paddusw %%mm5, %%mm3\n\t" | |
907 "paddusw %%mm6, %%mm2\n\t" | |
908 "paddusw %%mm6, %%mm3\n\t" | |
909 "psrlw $1, %%mm2\n\t" | |
910 "psrlw $1, %%mm3\n\t" | |
911 "psubsw %%mm2, %%mm0\n\t" | |
912 "psubsw %%mm3, %%mm1\n\t" | |
913 "movq %%mm0, %0\n\t" | |
914 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
915 :"+m"(*p) |
0 | 916 :"m"(*pix) |
917 :"memory"); | |
918 pix += line_size; | |
919 p += 8; | |
920 } while (--h); | |
921 } | |
922 | |
923 static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
924 { | |
925 DCTELEM *p; | |
926 const UINT8 *pix; | |
927 p = block; | |
928 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
929 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
930 MOVQ_WONE(mm6); |
0 | 931 do { |
932 __asm __volatile( | |
933 "movq %0, %%mm0\n\t" | |
934 "movq %1, %%mm2\n\t" | |
935 "movq 8%0, %%mm1\n\t" | |
936 "movq %2, %%mm4\n\t" | |
937 "movq %%mm2, %%mm3\n\t" | |
938 "movq %%mm4, %%mm5\n\t" | |
939 "punpcklbw %%mm7, %%mm2\n\t" | |
940 "punpckhbw %%mm7, %%mm3\n\t" | |
941 "punpcklbw %%mm7, %%mm4\n\t" | |
942 "punpckhbw %%mm7, %%mm5\n\t" | |
943 "paddusw %%mm4, %%mm2\n\t" | |
944 "paddusw %%mm5, %%mm3\n\t" | |
945 "paddusw %%mm6, %%mm2\n\t" | |
946 "paddusw %%mm6, %%mm3\n\t" | |
947 "psrlw $1, %%mm2\n\t" | |
948 "psrlw $1, %%mm3\n\t" | |
949 "psubsw %%mm2, %%mm0\n\t" | |
950 "psubsw %%mm3, %%mm1\n\t" | |
951 "movq %%mm0, %0\n\t" | |
952 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
953 :"+m"(*p) |
0 | 954 :"m"(*pix), "m"(*(pix+line_size)) |
955 :"memory"); | |
956 pix += line_size; | |
957 p += 8; | |
958 } while (--h); | |
959 } | |
960 | |
961 static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
962 { | |
963 DCTELEM *p; | |
964 const UINT8 *pix; | |
965 p = block; | |
966 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
967 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
968 MOVQ_WTWO(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
969 JUMPALIGN(); |
0 | 970 do { |
971 __asm __volatile( | |
972 "movq %1, %%mm0\n\t" | |
973 "movq %2, %%mm1\n\t" | |
974 "movq 1%1, %%mm4\n\t" | |
975 "movq 1%2, %%mm5\n\t" | |
976 "movq %%mm0, %%mm2\n\t" | |
977 "movq %%mm1, %%mm3\n\t" | |
978 "punpcklbw %%mm7, %%mm0\n\t" | |
979 "punpcklbw %%mm7, %%mm1\n\t" | |
980 "punpckhbw %%mm7, %%mm2\n\t" | |
981 "punpckhbw %%mm7, %%mm3\n\t" | |
982 "paddusw %%mm1, %%mm0\n\t" | |
983 "paddusw %%mm3, %%mm2\n\t" | |
984 "movq %%mm4, %%mm1\n\t" | |
985 "movq %%mm5, %%mm3\n\t" | |
986 "punpcklbw %%mm7, %%mm4\n\t" | |
987 "punpcklbw %%mm7, %%mm5\n\t" | |
988 "punpckhbw %%mm7, %%mm1\n\t" | |
989 "punpckhbw %%mm7, %%mm3\n\t" | |
990 "paddusw %%mm5, %%mm4\n\t" | |
991 "paddusw %%mm3, %%mm1\n\t" | |
992 "paddusw %%mm6, %%mm4\n\t" | |
993 "paddusw %%mm6, %%mm1\n\t" | |
994 "paddusw %%mm4, %%mm0\n\t" | |
995 "paddusw %%mm1, %%mm2\n\t" | |
996 "movq %0, %%mm1\n\t" | |
997 "movq 8%0, %%mm3\n\t" | |
998 "psrlw $2, %%mm0\n\t" | |
999 "psrlw $2, %%mm2\n\t" | |
1000 "psubsw %%mm0, %%mm1\n\t" | |
1001 "psubsw %%mm2, %%mm3\n\t" | |
1002 "movq %%mm1, %0\n\t" | |
1003 "movq %%mm3, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
1004 :"+m"(*p) |
0 | 1005 :"m"(*pix), |
1006 "m"(*(pix+line_size)) | |
1007 :"memory"); | |
1008 pix += line_size; | |
1009 p += 8 ; | |
1010 } while(--h); | |
1011 } | |
1012 | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
/*
 * Empty stand-in routine. It is only referenced by the disabled
 * "speed testing" section of dsputil_init_mmx(), where it replaces the
 * DSP function pointers.
 */
static void just_return()
{
}
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1014 |
0 | 1015 void dsputil_init_mmx(void) |
1016 { | |
1017 mm_flags = mm_support(); | |
188 | 1018 #if 1 |
1019 printf("libavcodec: CPU flags:"); | |
0 | 1020 if (mm_flags & MM_MMX) |
1021 printf(" mmx"); | |
1022 if (mm_flags & MM_MMXEXT) | |
1023 printf(" mmxext"); | |
1024 if (mm_flags & MM_3DNOW) | |
1025 printf(" 3dnow"); | |
1026 if (mm_flags & MM_SSE) | |
1027 printf(" sse"); | |
1028 if (mm_flags & MM_SSE2) | |
1029 printf(" sse2"); | |
1030 printf("\n"); | |
1031 #endif | |
1032 | |
1033 if (mm_flags & MM_MMX) { | |
1034 get_pixels = get_pixels_mmx; | |
1035 put_pixels_clamped = put_pixels_clamped_mmx; | |
1036 add_pixels_clamped = add_pixels_clamped_mmx; | |
1037 | |
1038 pix_abs16x16 = pix_abs16x16_mmx; | |
1039 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
1040 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
1041 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; | |
1042 av_fdct = fdct_mmx; | |
1043 | |
1044 put_pixels_tab[0] = put_pixels_mmx; | |
1045 put_pixels_tab[1] = put_pixels_x2_mmx; | |
1046 put_pixels_tab[2] = put_pixels_y2_mmx; | |
1047 put_pixels_tab[3] = put_pixels_xy2_mmx; | |
1048 | |
1049 put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
1050 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1051 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1052 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
1053 | |
1054 avg_pixels_tab[0] = avg_pixels_mmx; | |
1055 avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
1056 avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
1057 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1058 | |
1059 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
1060 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
1061 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
1062 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
1063 | |
1064 sub_pixels_tab[0] = sub_pixels_mmx; | |
1065 sub_pixels_tab[1] = sub_pixels_x2_mmx; | |
1066 sub_pixels_tab[2] = sub_pixels_y2_mmx; | |
1067 sub_pixels_tab[3] = sub_pixels_xy2_mmx; | |
1068 | |
1069 if (mm_flags & MM_MMXEXT) { | |
1070 pix_abs16x16 = pix_abs16x16_sse; | |
1071 } | |
1072 | |
1073 if (mm_flags & MM_SSE) { | |
1074 put_pixels_tab[1] = put_pixels_x2_sse; | |
1075 put_pixels_tab[2] = put_pixels_y2_sse; | |
1076 | |
1077 avg_pixels_tab[0] = avg_pixels_sse; | |
1078 avg_pixels_tab[1] = avg_pixels_x2_sse; | |
1079 avg_pixels_tab[2] = avg_pixels_y2_sse; | |
1080 avg_pixels_tab[3] = avg_pixels_xy2_sse; | |
1081 | |
1082 sub_pixels_tab[1] = sub_pixels_x2_sse; | |
1083 sub_pixels_tab[2] = sub_pixels_y2_sse; | |
1084 } else if (mm_flags & MM_3DNOW) { | |
1085 put_pixels_tab[1] = put_pixels_x2_3dnow; | |
1086 put_pixels_tab[2] = put_pixels_y2_3dnow; | |
1087 | |
1088 avg_pixels_tab[0] = avg_pixels_3dnow; | |
1089 avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
1090 avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
1091 avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
1092 | |
1093 sub_pixels_tab[1] = sub_pixels_x2_3dnow; | |
1094 sub_pixels_tab[2] = sub_pixels_y2_3dnow; | |
1095 } | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1096 |
42 | 1097 /* idct */ |
1098 if (mm_flags & MM_MMXEXT) { | |
1099 ff_idct = ff_mmxext_idct; | |
1100 } else { | |
1101 ff_idct = ff_mmx_idct; | |
1102 } | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1103 #ifdef SIMPLE_IDCT |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1104 // ff_idct = simple_idct; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1105 ff_idct = simple_idct_mmx; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1106 #endif |
0 | 1107 } |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1108 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1109 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1110 // for speed testing |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1111 get_pixels = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1112 put_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1113 add_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1114 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1115 pix_abs16x16 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1116 pix_abs16x16_x2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1117 pix_abs16x16_y2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1118 pix_abs16x16_xy2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1119 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1120 put_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1121 put_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1122 put_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1123 put_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1124 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1125 put_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1126 put_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1127 put_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1128 put_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1129 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1130 avg_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1131 avg_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1132 avg_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1133 avg_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1134 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1135 avg_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1136 avg_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1137 avg_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1138 avg_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1139 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1140 sub_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1141 sub_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1142 sub_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1143 sub_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1144 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1145 //av_fdct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1146 //ff_idct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1147 #endif |
0 | 1148 } |