Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 386:f49629bab18d libavcodec
hopefully faster mmx2&3dnow MC
author | michaelni |
---|---|
date | Fri, 17 May 2002 01:04:14 +0000 |
parents | 8635a7036395 |
children | b8f3affeb8e1 |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
3 * Copyright (c) 2000, 2001 Gerard Lantau. | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 * | |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 #include "../dsputil.h" | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
23 #include "../simple_idct.h" |
386 | 24 #include "../mangle.h" |
0 | 25 |
5 | 26 int mm_flags; /* multimedia extension flags */ |
27 | |
294 | 28 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); |
29 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
30 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
31 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
32 | |
33 int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
34 int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
35 int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
36 int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
37 | |
38 int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
39 int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
40 int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
41 int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
42 | |
43 int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
44 int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
45 int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
46 int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
47 | |
42 | 48 /* external functions, from idct_mmx.c */ |
49 void ff_mmx_idct(DCTELEM *block); | |
50 void ff_mmxext_idct(DCTELEM *block); | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
51 |
/* pixel operations */

/*
 * 8-byte-aligned 64-bit constants, suitable as movq memory operands:
 *   mm_bone - 0x01 replicated in each of the 8 bytes
 *   mm_wone - 0x0001 replicated in each of the 4 words
 *   mm_wtwo - 0x0002 replicated in each of the 4 words
 */
static const unsigned long long int mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
0 | 58 |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
/* Emit a ".balign 8" directive; used in this file just before the per-row loops. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)

/* Clear MMX register regd by XOR-ing it with itself. */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/*
 * MOVQ_WONE(regd) / MOVQ_WTWO(regd): fill regd with four 16-bit words
 * holding 1 / 2 respectively.
 */
#ifndef PIC
/* Non-PIC build: load the constants straight from mm_wone / mm_wtwo. */
#define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants:
// the value is synthesized in the register instead of being read from memory.
// pcmpeqd reg,reg -> all bits set (-1); psrlw $15 leaves 1 in every word.
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd ::)

// as above, then psllw $1 doubles each word to 2.
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd " \n\t" \
        "psllw $1, %%" #regd ::)
#endif
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
79 |
0 | 80 /***********************************/ |
81 /* 3Dnow specific */ | |
82 | |
83 #define DEF(x) x ## _3dnow | |
84 /* for Athlons PAVGUSB is preferred */ | |
85 #define PAVGB "pavgusb" | |
86 | |
87 #include "dsputil_mmx_avg.h" | |
88 | |
89 #undef DEF | |
90 #undef PAVGB | |
91 | |
92 /***********************************/ | |
93 /* MMX2 specific */ | |
94 | |
386 | 95 #define DEF(x) x ## _mmx2 |
0 | 96 |
97 /* Introduced only in MMX2 set */ | |
98 #define PAVGB "pavgb" | |
99 | |
100 #include "dsputil_mmx_avg.h" | |
101 | |
102 #undef DEF | |
103 #undef PAVGB | |
104 | |
105 /***********************************/ | |
106 /* standard MMX */ | |
107 | |
108 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
109 { | |
386 | 110 asm volatile( |
111 "movl $-128, %%eax \n\t" | |
112 "pxor %%mm7, %%mm7 \n\t" | |
113 ".balign 16 \n\t" | |
114 "1: \n\t" | |
115 "movq (%0), %%mm0 \n\t" | |
116 "movq (%0, %2), %%mm2 \n\t" | |
117 "movq %%mm0, %%mm1 \n\t" | |
118 "movq %%mm2, %%mm3 \n\t" | |
119 "punpcklbw %%mm7, %%mm0 \n\t" | |
120 "punpckhbw %%mm7, %%mm1 \n\t" | |
121 "punpcklbw %%mm7, %%mm2 \n\t" | |
122 "punpckhbw %%mm7, %%mm3 \n\t" | |
123 "movq %%mm0, (%1, %%eax)\n\t" | |
124 "movq %%mm1, 8(%1, %%eax)\n\t" | |
125 "movq %%mm2, 16(%1, %%eax)\n\t" | |
126 "movq %%mm3, 24(%1, %%eax)\n\t" | |
127 "addl %3, %0 \n\t" | |
128 "addl $32, %%eax \n\t" | |
129 "js 1b \n\t" | |
130 : "+r" (pixels) | |
131 : "r" (block+64), "r" (line_size), "r" (line_size*2) | |
132 : "%eax" | |
133 ); | |
0 | 134 } |
135 | |
324 | 136 static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) |
137 { | |
138 asm volatile( | |
386 | 139 "pxor %%mm7, %%mm7 \n\t" |
140 "movl $-128, %%eax \n\t" | |
324 | 141 ".balign 16 \n\t" |
142 "1: \n\t" | |
143 "movq (%0), %%mm0 \n\t" | |
144 "movq (%1), %%mm2 \n\t" | |
145 "movq %%mm0, %%mm1 \n\t" | |
146 "movq %%mm2, %%mm3 \n\t" | |
147 "punpcklbw %%mm7, %%mm0 \n\t" | |
148 "punpckhbw %%mm7, %%mm1 \n\t" | |
149 "punpcklbw %%mm7, %%mm2 \n\t" | |
150 "punpckhbw %%mm7, %%mm3 \n\t" | |
151 "psubw %%mm2, %%mm0 \n\t" | |
152 "psubw %%mm3, %%mm1 \n\t" | |
153 "movq %%mm0, (%2, %%eax)\n\t" | |
154 "movq %%mm1, 8(%2, %%eax)\n\t" | |
155 "addl %3, %0 \n\t" | |
156 "addl %3, %1 \n\t" | |
157 "addl $16, %%eax \n\t" | |
158 "jnz 1b \n\t" | |
159 : "+r" (s1), "+r" (s2) | |
160 : "r" (block+64), "r" (stride) | |
161 : "%eax" | |
162 ); | |
163 } | |
164 | |
0 | 165 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) |
166 { | |
167 const DCTELEM *p; | |
168 UINT8 *pix; | |
169 | |
170 /* read the pixels */ | |
171 p = block; | |
172 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
173 /* unrolled loop */ |
0 | 174 __asm __volatile( |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
175 "movq %3, %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
176 "movq 8%3, %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
177 "movq 16%3, %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
178 "movq 24%3, %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
179 "movq 32%3, %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
180 "movq 40%3, %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
181 "movq 48%3, %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
182 "movq 56%3, %%mm7\n\t" |
0 | 183 "packuswb %%mm1, %%mm0\n\t" |
184 "packuswb %%mm3, %%mm2\n\t" | |
185 "packuswb %%mm5, %%mm4\n\t" | |
186 "packuswb %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
187 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
188 "movq %%mm2, (%0, %1)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
189 "movq %%mm4, (%0, %1, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
190 "movq %%mm6, (%0, %2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
191 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) |
0 | 192 :"memory"); |
193 pix += line_size*4; | |
194 p += 32; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
195 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
196 // if here would be an exact copy of the code above |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
197 // compiler would generate some very strange code |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
198 // thus using "r" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
199 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
200 "movq (%3), %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
201 "movq 8(%3), %%mm1\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
202 "movq 16(%3), %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
203 "movq 24(%3), %%mm3\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
204 "movq 32(%3), %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
205 "movq 40(%3), %%mm5\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
206 "movq 48(%3), %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
207 "movq 56(%3), %%mm7\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
208 "packuswb %%mm1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
209 "packuswb %%mm3, %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
210 "packuswb %%mm5, %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
211 "packuswb %%mm7, %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
212 "movq %%mm0, (%0)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
213 "movq %%mm2, (%0, %1)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
214 "movq %%mm4, (%0, %1, 2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
215 "movq %%mm6, (%0, %2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
216 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
217 :"memory"); |
0 | 218 } |
219 | |
220 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
221 { | |
222 const DCTELEM *p; | |
223 UINT8 *pix; | |
224 int i; | |
225 | |
226 /* read the pixels */ | |
227 p = block; | |
228 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
229 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
230 i = 4; |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
231 do { |
0 | 232 __asm __volatile( |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
233 "movq (%2), %%mm0\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
234 "movq 8(%2), %%mm1\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
235 "movq 16(%2), %%mm2\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
236 "movq 24(%2), %%mm3\n\t" |
0 | 237 "movq %0, %%mm4\n\t" |
238 "movq %1, %%mm6\n\t" | |
239 "movq %%mm4, %%mm5\n\t" | |
240 "punpcklbw %%mm7, %%mm4\n\t" | |
241 "punpckhbw %%mm7, %%mm5\n\t" | |
242 "paddsw %%mm4, %%mm0\n\t" | |
243 "paddsw %%mm5, %%mm1\n\t" | |
244 "movq %%mm6, %%mm5\n\t" | |
245 "punpcklbw %%mm7, %%mm6\n\t" | |
246 "punpckhbw %%mm7, %%mm5\n\t" | |
247 "paddsw %%mm6, %%mm2\n\t" | |
248 "paddsw %%mm5, %%mm3\n\t" | |
249 "packuswb %%mm1, %%mm0\n\t" | |
250 "packuswb %%mm3, %%mm2\n\t" | |
251 "movq %%mm0, %0\n\t" | |
252 "movq %%mm2, %1\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
253 :"+m"(*pix), "+m"(*(pix+line_size)) |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
254 :"r"(p) |
0 | 255 :"memory"); |
256 pix += line_size*2; | |
257 p += 16; | |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
258 } while (--i); |
0 | 259 } |
260 | |
261 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
262 { | |
386 | 263 #if 0 //FIXME h==4 case |
264 asm volatile( | |
265 "xorl %%eax, %%eax \n\t" | |
266 "movl %3, %%esi \n\t" | |
267 "1: \n\t" | |
268 "movq (%1, %%eax), %%mm0 \n\t" | |
269 "movq %%mm0, (%0, %%eax) \n\t" | |
270 "addl %2, %%eax \n\t" | |
271 "movq (%1, %%eax), %%mm0 \n\t" | |
272 "movq %%mm0, (%0, %%eax) \n\t" | |
273 "addl %2, %%eax \n\t" | |
274 "movq (%1, %%eax), %%mm0 \n\t" | |
275 "movq %%mm0, (%0, %%eax) \n\t" | |
276 "addl %2, %%eax \n\t" | |
277 "movq (%1, %%eax), %%mm0 \n\t" | |
278 "movq %%mm0, (%0, %%eax) \n\t" | |
279 "addl %2, %%eax \n\t" | |
280 "movq (%1, %%eax), %%mm0 \n\t" | |
281 "movq %%mm0, (%0, %%eax) \n\t" | |
282 "addl %2, %%eax \n\t" | |
283 "movq (%1, %%eax), %%mm0 \n\t" | |
284 "movq %%mm0, (%0, %%eax) \n\t" | |
285 "addl %2, %%eax \n\t" | |
286 "movq (%1, %%eax), %%mm0 \n\t" | |
287 "movq %%mm0, (%0, %%eax) \n\t" | |
288 "addl %2, %%eax \n\t" | |
289 "movq (%1, %%eax), %%mm0 \n\t" | |
290 "movq %%mm0, (%0, %%eax) \n\t" | |
291 "addl %2, %%eax \n\t" | |
292 "subl $8, %%esi \n\t" | |
293 " jnz 1b \n\t" | |
294 :: "r" (block), "r" (pixels), "r"(line_size), "m"(h) | |
295 : "%eax", "%esi", "memory" | |
296 ); | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
297 #else |
386 | 298 asm volatile( |
299 "xorl %%eax, %%eax \n\t" | |
300 "movl %3, %%esi \n\t" | |
301 "1: \n\t" | |
302 "movq (%1, %%eax), %%mm0 \n\t" | |
303 "movq %%mm0, (%0, %%eax) \n\t" | |
304 "addl %2, %%eax \n\t" | |
305 "movq (%1, %%eax), %%mm0 \n\t" | |
306 "movq %%mm0, (%0, %%eax) \n\t" | |
307 "addl %2, %%eax \n\t" | |
308 "movq (%1, %%eax), %%mm0 \n\t" | |
309 "movq %%mm0, (%0, %%eax) \n\t" | |
310 "addl %2, %%eax \n\t" | |
311 "movq (%1, %%eax), %%mm0 \n\t" | |
312 "movq %%mm0, (%0, %%eax) \n\t" | |
313 "addl %2, %%eax \n\t" | |
314 "subl $4, %%esi \n\t" | |
315 " jnz 1b \n\t" | |
316 :: "r" (block), "r" (pixels), "r"(line_size), "m"(h) | |
317 : "%eax", "%esi", "memory" | |
318 ); | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
319 #endif |
0 | 320 } |
321 | |
322 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
323 { | |
324 UINT8 *p; | |
325 const UINT8 *pix; | |
326 p = block; | |
327 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
328 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
329 MOVQ_WONE(mm4); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
330 JUMPALIGN(); |
0 | 331 do { |
332 __asm __volatile( | |
333 "movq %1, %%mm0\n\t" | |
334 "movq 1%1, %%mm1\n\t" | |
335 "movq %%mm0, %%mm2\n\t" | |
336 "movq %%mm1, %%mm3\n\t" | |
337 "punpcklbw %%mm7, %%mm0\n\t" | |
338 "punpcklbw %%mm7, %%mm1\n\t" | |
339 "punpckhbw %%mm7, %%mm2\n\t" | |
340 "punpckhbw %%mm7, %%mm3\n\t" | |
341 "paddusw %%mm1, %%mm0\n\t" | |
342 "paddusw %%mm3, %%mm2\n\t" | |
343 "paddusw %%mm4, %%mm0\n\t" | |
344 "paddusw %%mm4, %%mm2\n\t" | |
345 "psrlw $1, %%mm0\n\t" | |
346 "psrlw $1, %%mm2\n\t" | |
347 "packuswb %%mm2, %%mm0\n\t" | |
348 "movq %%mm0, %0\n\t" | |
349 :"=m"(*p) | |
350 :"m"(*pix) | |
351 :"memory"); | |
352 pix += line_size; p += line_size; | |
353 } while (--h); | |
354 } | |
355 | |
356 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
357 { | |
358 UINT8 *p; | |
359 const UINT8 *pix; | |
360 p = block; | |
361 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
362 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
363 MOVQ_WONE(mm4); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
364 JUMPALIGN(); |
0 | 365 do { |
366 __asm __volatile( | |
367 "movq %1, %%mm0\n\t" | |
368 "movq %2, %%mm1\n\t" | |
369 "movq %%mm0, %%mm2\n\t" | |
370 "movq %%mm1, %%mm3\n\t" | |
371 "punpcklbw %%mm7, %%mm0\n\t" | |
372 "punpcklbw %%mm7, %%mm1\n\t" | |
373 "punpckhbw %%mm7, %%mm2\n\t" | |
374 "punpckhbw %%mm7, %%mm3\n\t" | |
375 "paddusw %%mm1, %%mm0\n\t" | |
376 "paddusw %%mm3, %%mm2\n\t" | |
377 "paddusw %%mm4, %%mm0\n\t" | |
378 "paddusw %%mm4, %%mm2\n\t" | |
379 "psrlw $1, %%mm0\n\t" | |
380 "psrlw $1, %%mm2\n\t" | |
381 "packuswb %%mm2, %%mm0\n\t" | |
382 "movq %%mm0, %0\n\t" | |
383 :"=m"(*p) | |
384 :"m"(*pix), | |
385 "m"(*(pix+line_size)) | |
386 :"memory"); | |
387 pix += line_size; | |
388 p += line_size; | |
389 } while (--h); | |
390 } | |
391 | |
392 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
393 { | |
394 UINT8 *p; | |
395 const UINT8 *pix; | |
396 p = block; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
397 pix = pixels; // 1s |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
398 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
399 MOVQ_WTWO(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
400 JUMPALIGN(); |
0 | 401 do { |
402 __asm __volatile( | |
403 "movq %1, %%mm0\n\t" | |
404 "movq %2, %%mm1\n\t" | |
405 "movq 1%1, %%mm4\n\t" | |
406 "movq 1%2, %%mm5\n\t" | |
407 "movq %%mm0, %%mm2\n\t" | |
408 "movq %%mm1, %%mm3\n\t" | |
409 "punpcklbw %%mm7, %%mm0\n\t" | |
410 "punpcklbw %%mm7, %%mm1\n\t" | |
411 "punpckhbw %%mm7, %%mm2\n\t" | |
412 "punpckhbw %%mm7, %%mm3\n\t" | |
413 "paddusw %%mm1, %%mm0\n\t" | |
414 "paddusw %%mm3, %%mm2\n\t" | |
415 "movq %%mm4, %%mm1\n\t" | |
416 "movq %%mm5, %%mm3\n\t" | |
417 "punpcklbw %%mm7, %%mm4\n\t" | |
418 "punpcklbw %%mm7, %%mm5\n\t" | |
419 "punpckhbw %%mm7, %%mm1\n\t" | |
420 "punpckhbw %%mm7, %%mm3\n\t" | |
421 "paddusw %%mm5, %%mm4\n\t" | |
422 "paddusw %%mm3, %%mm1\n\t" | |
423 "paddusw %%mm6, %%mm4\n\t" | |
424 "paddusw %%mm6, %%mm1\n\t" | |
425 "paddusw %%mm4, %%mm0\n\t" | |
426 "paddusw %%mm1, %%mm2\n\t" | |
427 "psrlw $2, %%mm0\n\t" | |
428 "psrlw $2, %%mm2\n\t" | |
429 "packuswb %%mm2, %%mm0\n\t" | |
430 "movq %%mm0, %0\n\t" | |
431 :"=m"(*p) | |
432 :"m"(*pix), | |
433 "m"(*(pix+line_size)) | |
434 :"memory"); | |
435 pix += line_size; | |
436 p += line_size; | |
437 } while(--h); | |
438 } | |
439 | |
440 static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
441 { | |
442 UINT8 *p; | |
443 const UINT8 *pix; | |
444 p = block; | |
445 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
446 MOVQ_ZERO(mm7); |
0 | 447 do { |
448 __asm __volatile( | |
449 "movq %1, %%mm0\n\t" | |
450 "movq 1%1, %%mm1\n\t" | |
451 "movq %%mm0, %%mm2\n\t" | |
452 "movq %%mm1, %%mm3\n\t" | |
453 "punpcklbw %%mm7, %%mm0\n\t" | |
454 "punpcklbw %%mm7, %%mm1\n\t" | |
455 "punpckhbw %%mm7, %%mm2\n\t" | |
456 "punpckhbw %%mm7, %%mm3\n\t" | |
457 "paddusw %%mm1, %%mm0\n\t" | |
458 "paddusw %%mm3, %%mm2\n\t" | |
459 "psrlw $1, %%mm0\n\t" | |
460 "psrlw $1, %%mm2\n\t" | |
461 "packuswb %%mm2, %%mm0\n\t" | |
462 "movq %%mm0, %0\n\t" | |
463 :"=m"(*p) | |
464 :"m"(*pix) | |
465 :"memory"); | |
466 pix += line_size; | |
467 p += line_size; | |
468 } while (--h); | |
469 } | |
470 | |
471 static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
472 { | |
473 UINT8 *p; | |
474 const UINT8 *pix; | |
475 p = block; | |
476 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
477 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
478 JUMPALIGN(); |
0 | 479 do { |
480 __asm __volatile( | |
481 "movq %1, %%mm0\n\t" | |
482 "movq %2, %%mm1\n\t" | |
483 "movq %%mm0, %%mm2\n\t" | |
484 "movq %%mm1, %%mm3\n\t" | |
485 "punpcklbw %%mm7, %%mm0\n\t" | |
486 "punpcklbw %%mm7, %%mm1\n\t" | |
487 "punpckhbw %%mm7, %%mm2\n\t" | |
488 "punpckhbw %%mm7, %%mm3\n\t" | |
489 "paddusw %%mm1, %%mm0\n\t" | |
490 "paddusw %%mm3, %%mm2\n\t" | |
491 "psrlw $1, %%mm0\n\t" | |
492 "psrlw $1, %%mm2\n\t" | |
493 "packuswb %%mm2, %%mm0\n\t" | |
494 "movq %%mm0, %0\n\t" | |
495 :"=m"(*p) | |
496 :"m"(*pix), | |
497 "m"(*(pix+line_size)) | |
498 :"memory"); | |
499 pix += line_size; | |
500 p += line_size; | |
501 } while(--h); | |
502 } | |
503 | |
504 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
505 { | |
506 UINT8 *p; | |
507 const UINT8 *pix; | |
508 p = block; | |
509 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
510 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
511 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
512 JUMPALIGN(); |
0 | 513 do { |
514 __asm __volatile( | |
515 "movq %1, %%mm0\n\t" | |
516 "movq %2, %%mm1\n\t" | |
517 "movq 1%1, %%mm4\n\t" | |
518 "movq 1%2, %%mm5\n\t" | |
519 "movq %%mm0, %%mm2\n\t" | |
520 "movq %%mm1, %%mm3\n\t" | |
521 "punpcklbw %%mm7, %%mm0\n\t" | |
522 "punpcklbw %%mm7, %%mm1\n\t" | |
523 "punpckhbw %%mm7, %%mm2\n\t" | |
524 "punpckhbw %%mm7, %%mm3\n\t" | |
525 "paddusw %%mm1, %%mm0\n\t" | |
526 "paddusw %%mm3, %%mm2\n\t" | |
527 "movq %%mm4, %%mm1\n\t" | |
528 "movq %%mm5, %%mm3\n\t" | |
529 "punpcklbw %%mm7, %%mm4\n\t" | |
530 "punpcklbw %%mm7, %%mm5\n\t" | |
531 "punpckhbw %%mm7, %%mm1\n\t" | |
532 "punpckhbw %%mm7, %%mm3\n\t" | |
533 "paddusw %%mm5, %%mm4\n\t" | |
534 "paddusw %%mm3, %%mm1\n\t" | |
535 "paddusw %%mm6, %%mm4\n\t" | |
536 "paddusw %%mm6, %%mm1\n\t" | |
537 "paddusw %%mm4, %%mm0\n\t" | |
538 "paddusw %%mm1, %%mm2\n\t" | |
539 "psrlw $2, %%mm0\n\t" | |
540 "psrlw $2, %%mm2\n\t" | |
541 "packuswb %%mm2, %%mm0\n\t" | |
542 "movq %%mm0, %0\n\t" | |
543 :"=m"(*p) | |
544 :"m"(*pix), | |
545 "m"(*(pix+line_size)) | |
546 :"memory"); | |
547 pix += line_size; | |
548 p += line_size; | |
549 } while(--h); | |
550 } | |
551 | |
552 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
553 { | |
554 UINT8 *p; | |
555 const UINT8 *pix; | |
556 p = block; | |
557 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
558 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
559 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
560 JUMPALIGN(); |
0 | 561 do { |
562 __asm __volatile( | |
563 "movq %0, %%mm0\n\t" | |
564 "movq %1, %%mm1\n\t" | |
565 "movq %%mm0, %%mm2\n\t" | |
566 "movq %%mm1, %%mm3\n\t" | |
567 "punpcklbw %%mm7, %%mm0\n\t" | |
568 "punpcklbw %%mm7, %%mm1\n\t" | |
569 "punpckhbw %%mm7, %%mm2\n\t" | |
570 "punpckhbw %%mm7, %%mm3\n\t" | |
571 "paddusw %%mm1, %%mm0\n\t" | |
572 "paddusw %%mm3, %%mm2\n\t" | |
573 "paddusw %%mm6, %%mm0\n\t" | |
574 "paddusw %%mm6, %%mm2\n\t" | |
575 "psrlw $1, %%mm0\n\t" | |
576 "psrlw $1, %%mm2\n\t" | |
577 "packuswb %%mm2, %%mm0\n\t" | |
578 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
579 :"+m"(*p) |
0 | 580 :"m"(*pix) |
581 :"memory"); | |
582 pix += line_size; | |
583 p += line_size; | |
584 } | |
585 while (--h); | |
586 } | |
587 | |
588 static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
589 { | |
590 UINT8 *p; | |
591 const UINT8 *pix; | |
592 p = block; | |
593 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
594 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
595 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
596 JUMPALIGN(); |
0 | 597 do { |
598 __asm __volatile( | |
599 "movq %1, %%mm1\n\t" | |
600 "movq %0, %%mm0\n\t" | |
601 "movq 1%1, %%mm4\n\t" | |
602 "movq %%mm0, %%mm2\n\t" | |
603 "movq %%mm1, %%mm3\n\t" | |
604 "movq %%mm4, %%mm5\n\t" | |
605 "punpcklbw %%mm7, %%mm1\n\t" | |
606 "punpckhbw %%mm7, %%mm3\n\t" | |
607 "punpcklbw %%mm7, %%mm4\n\t" | |
608 "punpckhbw %%mm7, %%mm5\n\t" | |
609 "punpcklbw %%mm7, %%mm0\n\t" | |
610 "punpckhbw %%mm7, %%mm2\n\t" | |
611 "paddusw %%mm4, %%mm1\n\t" | |
612 "paddusw %%mm5, %%mm3\n\t" | |
613 "paddusw %%mm6, %%mm1\n\t" | |
614 "paddusw %%mm6, %%mm3\n\t" | |
615 "psrlw $1, %%mm1\n\t" | |
616 "psrlw $1, %%mm3\n\t" | |
617 "paddusw %%mm6, %%mm0\n\t" | |
618 "paddusw %%mm6, %%mm2\n\t" | |
619 "paddusw %%mm1, %%mm0\n\t" | |
620 "paddusw %%mm3, %%mm2\n\t" | |
621 "psrlw $1, %%mm0\n\t" | |
622 "psrlw $1, %%mm2\n\t" | |
623 "packuswb %%mm2, %%mm0\n\t" | |
624 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
625 :"+m"(*p) |
0 | 626 :"m"(*pix) |
627 :"memory"); | |
628 pix += line_size; | |
629 p += line_size; | |
630 } while (--h); | |
631 } | |
632 | |
633 static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
634 { | |
635 UINT8 *p; | |
636 const UINT8 *pix; | |
637 p = block; | |
638 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
639 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
640 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
641 JUMPALIGN(); |
0 | 642 do { |
643 __asm __volatile( | |
644 "movq %1, %%mm1\n\t" | |
645 "movq %0, %%mm0\n\t" | |
646 "movq %2, %%mm4\n\t" | |
647 "movq %%mm0, %%mm2\n\t" | |
648 "movq %%mm1, %%mm3\n\t" | |
649 "movq %%mm4, %%mm5\n\t" | |
650 "punpcklbw %%mm7, %%mm1\n\t" | |
651 "punpckhbw %%mm7, %%mm3\n\t" | |
652 "punpcklbw %%mm7, %%mm4\n\t" | |
653 "punpckhbw %%mm7, %%mm5\n\t" | |
654 "punpcklbw %%mm7, %%mm0\n\t" | |
655 "punpckhbw %%mm7, %%mm2\n\t" | |
656 "paddusw %%mm4, %%mm1\n\t" | |
657 "paddusw %%mm5, %%mm3\n\t" | |
658 "paddusw %%mm6, %%mm1\n\t" | |
659 "paddusw %%mm6, %%mm3\n\t" | |
660 "psrlw $1, %%mm1\n\t" | |
661 "psrlw $1, %%mm3\n\t" | |
662 "paddusw %%mm6, %%mm0\n\t" | |
663 "paddusw %%mm6, %%mm2\n\t" | |
664 "paddusw %%mm1, %%mm0\n\t" | |
665 "paddusw %%mm3, %%mm2\n\t" | |
666 "psrlw $1, %%mm0\n\t" | |
667 "psrlw $1, %%mm2\n\t" | |
668 "packuswb %%mm2, %%mm0\n\t" | |
669 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
670 :"+m"(*p) |
0 | 671 :"m"(*pix), "m"(*(pix+line_size)) |
672 :"memory"); | |
673 pix += line_size; | |
674 p += line_size ; | |
675 } while(--h); | |
676 } | |
677 | |
678 static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
679 { | |
680 UINT8 *p; | |
681 const UINT8 *pix; | |
682 p = block; | |
683 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
684 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
685 // this doesn't seem to be used offten - so |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
686 // the inside usage of mm_wone is not optimized |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
687 MOVQ_WTWO(mm6); |
0 | 688 do { |
689 __asm __volatile( | |
690 "movq %1, %%mm0\n\t" | |
691 "movq %2, %%mm1\n\t" | |
692 "movq 1%1, %%mm4\n\t" | |
693 "movq 1%2, %%mm5\n\t" | |
694 "movq %%mm0, %%mm2\n\t" | |
695 "movq %%mm1, %%mm3\n\t" | |
696 "punpcklbw %%mm7, %%mm0\n\t" | |
697 "punpcklbw %%mm7, %%mm1\n\t" | |
698 "punpckhbw %%mm7, %%mm2\n\t" | |
699 "punpckhbw %%mm7, %%mm3\n\t" | |
700 "paddusw %%mm1, %%mm0\n\t" | |
701 "paddusw %%mm3, %%mm2\n\t" | |
702 "movq %%mm4, %%mm1\n\t" | |
703 "movq %%mm5, %%mm3\n\t" | |
704 "punpcklbw %%mm7, %%mm4\n\t" | |
705 "punpcklbw %%mm7, %%mm5\n\t" | |
706 "punpckhbw %%mm7, %%mm1\n\t" | |
707 "punpckhbw %%mm7, %%mm3\n\t" | |
708 "paddusw %%mm5, %%mm4\n\t" | |
709 "paddusw %%mm3, %%mm1\n\t" | |
710 "paddusw %%mm6, %%mm4\n\t" | |
711 "paddusw %%mm6, %%mm1\n\t" | |
712 "paddusw %%mm4, %%mm0\n\t" | |
713 "paddusw %%mm1, %%mm2\n\t" | |
714 "movq %3, %%mm5\n\t" | |
715 "psrlw $2, %%mm0\n\t" | |
716 "movq %0, %%mm1\n\t" | |
717 "psrlw $2, %%mm2\n\t" | |
718 "movq %%mm1, %%mm3\n\t" | |
719 "punpcklbw %%mm7, %%mm1\n\t" | |
720 "punpckhbw %%mm7, %%mm3\n\t" | |
721 "paddusw %%mm1, %%mm0\n\t" | |
722 "paddusw %%mm3, %%mm2\n\t" | |
723 "paddusw %%mm5, %%mm0\n\t" | |
724 "paddusw %%mm5, %%mm2\n\t" | |
725 "psrlw $1, %%mm0\n\t" | |
726 "psrlw $1, %%mm2\n\t" | |
727 "packuswb %%mm2, %%mm0\n\t" | |
728 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
729 :"+m"(*p) |
0 | 730 :"m"(*pix), |
8 | 731 "m"(*(pix+line_size)), "m"(mm_wone) |
0 | 732 :"memory"); |
733 pix += line_size; | |
734 p += line_size ; | |
735 } while(--h); | |
736 } | |
737 | |
738 static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
739 { | |
740 UINT8 *p; | |
741 const UINT8 *pix; | |
742 p = block; | |
743 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
744 MOVQ_ZERO(mm7); |
0 | 745 do { |
746 __asm __volatile( | |
747 "movq %1, %%mm0\n\t" | |
748 "movq %0, %%mm1\n\t" | |
749 "movq %%mm0, %%mm2\n\t" | |
750 "movq %%mm1, %%mm3\n\t" | |
751 "punpcklbw %%mm7, %%mm0\n\t" | |
752 "punpcklbw %%mm7, %%mm1\n\t" | |
753 "punpckhbw %%mm7, %%mm2\n\t" | |
754 "punpckhbw %%mm7, %%mm3\n\t" | |
755 "paddusw %%mm1, %%mm0\n\t" | |
756 "paddusw %%mm3, %%mm2\n\t" | |
757 "psrlw $1, %%mm0\n\t" | |
758 "psrlw $1, %%mm2\n\t" | |
759 "packuswb %%mm2, %%mm0\n\t" | |
760 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
761 :"+m"(*p) |
0 | 762 :"m"(*pix) |
763 :"memory"); | |
764 pix += line_size; | |
765 p += line_size ; | |
766 } while (--h); | |
767 } | |
768 | |
769 static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
770 { | |
771 UINT8 *p; | |
772 const UINT8 *pix; | |
773 p = block; | |
774 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
775 MOVQ_ZERO(mm7); |
0 | 776 do { |
777 __asm __volatile( | |
778 "movq %1, %%mm0\n\t" | |
779 "movq 1%1, %%mm1\n\t" | |
780 "movq %0, %%mm4\n\t" | |
781 "movq %%mm0, %%mm2\n\t" | |
782 "movq %%mm1, %%mm3\n\t" | |
783 "movq %%mm4, %%mm5\n\t" | |
784 "punpcklbw %%mm7, %%mm0\n\t" | |
785 "punpcklbw %%mm7, %%mm1\n\t" | |
786 "punpckhbw %%mm7, %%mm2\n\t" | |
787 "punpckhbw %%mm7, %%mm3\n\t" | |
788 "punpcklbw %%mm7, %%mm4\n\t" | |
789 "punpckhbw %%mm7, %%mm5\n\t" | |
790 "paddusw %%mm1, %%mm0\n\t" | |
791 "paddusw %%mm3, %%mm2\n\t" | |
792 "psrlw $1, %%mm0\n\t" | |
793 "psrlw $1, %%mm2\n\t" | |
794 "paddusw %%mm4, %%mm0\n\t" | |
795 "paddusw %%mm5, %%mm2\n\t" | |
796 "psrlw $1, %%mm0\n\t" | |
797 "psrlw $1, %%mm2\n\t" | |
798 "packuswb %%mm2, %%mm0\n\t" | |
799 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
800 :"+m"(*p) |
0 | 801 :"m"(*pix) |
802 :"memory"); | |
803 pix += line_size; | |
804 p += line_size; | |
805 } while (--h); | |
806 } | |
807 | |
808 static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
809 { | |
810 UINT8 *p; | |
811 const UINT8 *pix; | |
812 p = block; | |
813 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
814 MOVQ_ZERO(mm7); |
0 | 815 do { |
816 __asm __volatile( | |
817 "movq %1, %%mm0\n\t" | |
818 "movq %2, %%mm1\n\t" | |
819 "movq %0, %%mm4\n\t" | |
820 "movq %%mm0, %%mm2\n\t" | |
821 "movq %%mm1, %%mm3\n\t" | |
822 "movq %%mm4, %%mm5\n\t" | |
823 "punpcklbw %%mm7, %%mm0\n\t" | |
824 "punpcklbw %%mm7, %%mm1\n\t" | |
825 "punpckhbw %%mm7, %%mm2\n\t" | |
826 "punpckhbw %%mm7, %%mm3\n\t" | |
827 "punpcklbw %%mm7, %%mm4\n\t" | |
828 "punpckhbw %%mm7, %%mm5\n\t" | |
829 "paddusw %%mm1, %%mm0\n\t" | |
830 "paddusw %%mm3, %%mm2\n\t" | |
831 "psrlw $1, %%mm0\n\t" | |
832 "psrlw $1, %%mm2\n\t" | |
833 "paddusw %%mm4, %%mm0\n\t" | |
834 "paddusw %%mm5, %%mm2\n\t" | |
835 "psrlw $1, %%mm0\n\t" | |
836 "psrlw $1, %%mm2\n\t" | |
837 "packuswb %%mm2, %%mm0\n\t" | |
838 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
839 :"+m"(*p) |
0 | 840 :"m"(*pix), "m"(*(pix+line_size)) |
841 :"memory"); | |
842 pix += line_size; | |
843 p += line_size ; | |
844 } while(--h); | |
845 } | |
846 | |
847 static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
848 { | |
849 UINT8 *p; | |
850 const UINT8 *pix; | |
851 p = block; | |
852 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
853 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
854 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
855 JUMPALIGN(); |
0 | 856 do { |
857 __asm __volatile( | |
858 "movq %1, %%mm0\n\t" | |
859 "movq %2, %%mm1\n\t" | |
860 "movq 1%1, %%mm4\n\t" | |
861 "movq 1%2, %%mm5\n\t" | |
862 "movq %%mm0, %%mm2\n\t" | |
863 "movq %%mm1, %%mm3\n\t" | |
864 "punpcklbw %%mm7, %%mm0\n\t" | |
865 "punpcklbw %%mm7, %%mm1\n\t" | |
866 "punpckhbw %%mm7, %%mm2\n\t" | |
867 "punpckhbw %%mm7, %%mm3\n\t" | |
868 "paddusw %%mm1, %%mm0\n\t" | |
869 "paddusw %%mm3, %%mm2\n\t" | |
870 "movq %%mm4, %%mm1\n\t" | |
871 "movq %%mm5, %%mm3\n\t" | |
872 "punpcklbw %%mm7, %%mm4\n\t" | |
873 "punpcklbw %%mm7, %%mm5\n\t" | |
874 "punpckhbw %%mm7, %%mm1\n\t" | |
875 "punpckhbw %%mm7, %%mm3\n\t" | |
876 "paddusw %%mm5, %%mm4\n\t" | |
877 "paddusw %%mm3, %%mm1\n\t" | |
878 "paddusw %%mm6, %%mm4\n\t" | |
879 "paddusw %%mm6, %%mm1\n\t" | |
880 "paddusw %%mm4, %%mm0\n\t" | |
881 "paddusw %%mm1, %%mm2\n\t" | |
882 "movq %0, %%mm1\n\t" | |
883 "psrlw $2, %%mm0\n\t" | |
884 "movq %%mm1, %%mm3\n\t" | |
885 "psrlw $2, %%mm2\n\t" | |
886 "punpcklbw %%mm7, %%mm1\n\t" | |
887 "punpckhbw %%mm7, %%mm3\n\t" | |
888 "paddusw %%mm1, %%mm0\n\t" | |
889 "paddusw %%mm3, %%mm2\n\t" | |
890 "psrlw $1, %%mm0\n\t" | |
891 "psrlw $1, %%mm2\n\t" | |
892 "packuswb %%mm2, %%mm0\n\t" | |
893 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
894 :"+m"(*p) |
0 | 895 :"m"(*pix), |
896 "m"(*(pix+line_size)) | |
897 :"memory"); | |
898 pix += line_size; | |
899 p += line_size; | |
900 } while(--h); | |
901 } | |
902 | |
903 static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
904 { | |
905 DCTELEM *p; | |
906 const UINT8 *pix; | |
907 p = block; | |
908 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
909 MOVQ_ZERO(mm7); |
0 | 910 do { |
911 __asm __volatile( | |
912 "movq %0, %%mm0\n\t" | |
913 "movq %1, %%mm2\n\t" | |
914 "movq 8%0, %%mm1\n\t" | |
915 "movq %%mm2, %%mm3\n\t" | |
916 "punpcklbw %%mm7, %%mm2\n\t" | |
917 "punpckhbw %%mm7, %%mm3\n\t" | |
918 "psubsw %%mm2, %%mm0\n\t" | |
919 "psubsw %%mm3, %%mm1\n\t" | |
920 "movq %%mm0, %0\n\t" | |
921 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
922 :"+m"(*p) |
0 | 923 :"m"(*pix) |
924 :"memory"); | |
925 pix += line_size; | |
926 p += 8; | |
927 } while (--h); | |
928 } | |
929 | |
930 static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
931 { | |
932 DCTELEM *p; | |
933 const UINT8 *pix; | |
934 p = block; | |
935 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
936 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
937 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
938 JUMPALIGN(); |
0 | 939 do { |
940 __asm __volatile( | |
941 "movq %0, %%mm0\n\t" | |
942 "movq %1, %%mm2\n\t" | |
943 "movq 8%0, %%mm1\n\t" | |
944 "movq 1%1, %%mm4\n\t" | |
945 "movq %%mm2, %%mm3\n\t" | |
946 "movq %%mm4, %%mm5\n\t" | |
947 "punpcklbw %%mm7, %%mm2\n\t" | |
948 "punpckhbw %%mm7, %%mm3\n\t" | |
949 "punpcklbw %%mm7, %%mm4\n\t" | |
950 "punpckhbw %%mm7, %%mm5\n\t" | |
951 "paddusw %%mm4, %%mm2\n\t" | |
952 "paddusw %%mm5, %%mm3\n\t" | |
953 "paddusw %%mm6, %%mm2\n\t" | |
954 "paddusw %%mm6, %%mm3\n\t" | |
955 "psrlw $1, %%mm2\n\t" | |
956 "psrlw $1, %%mm3\n\t" | |
957 "psubsw %%mm2, %%mm0\n\t" | |
958 "psubsw %%mm3, %%mm1\n\t" | |
959 "movq %%mm0, %0\n\t" | |
960 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
961 :"+m"(*p) |
0 | 962 :"m"(*pix) |
963 :"memory"); | |
964 pix += line_size; | |
965 p += 8; | |
966 } while (--h); | |
967 } | |
968 | |
969 static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
970 { | |
971 DCTELEM *p; | |
972 const UINT8 *pix; | |
973 p = block; | |
974 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
975 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
976 MOVQ_WONE(mm6); |
0 | 977 do { |
978 __asm __volatile( | |
979 "movq %0, %%mm0\n\t" | |
980 "movq %1, %%mm2\n\t" | |
981 "movq 8%0, %%mm1\n\t" | |
982 "movq %2, %%mm4\n\t" | |
983 "movq %%mm2, %%mm3\n\t" | |
984 "movq %%mm4, %%mm5\n\t" | |
985 "punpcklbw %%mm7, %%mm2\n\t" | |
986 "punpckhbw %%mm7, %%mm3\n\t" | |
987 "punpcklbw %%mm7, %%mm4\n\t" | |
988 "punpckhbw %%mm7, %%mm5\n\t" | |
989 "paddusw %%mm4, %%mm2\n\t" | |
990 "paddusw %%mm5, %%mm3\n\t" | |
991 "paddusw %%mm6, %%mm2\n\t" | |
992 "paddusw %%mm6, %%mm3\n\t" | |
993 "psrlw $1, %%mm2\n\t" | |
994 "psrlw $1, %%mm3\n\t" | |
995 "psubsw %%mm2, %%mm0\n\t" | |
996 "psubsw %%mm3, %%mm1\n\t" | |
997 "movq %%mm0, %0\n\t" | |
998 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
999 :"+m"(*p) |
0 | 1000 :"m"(*pix), "m"(*(pix+line_size)) |
1001 :"memory"); | |
1002 pix += line_size; | |
1003 p += 8; | |
1004 } while (--h); | |
1005 } | |
1006 | |
1007 static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
1008 { | |
1009 DCTELEM *p; | |
1010 const UINT8 *pix; | |
1011 p = block; | |
1012 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1013 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1014 MOVQ_WTWO(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1015 JUMPALIGN(); |
0 | 1016 do { |
1017 __asm __volatile( | |
1018 "movq %1, %%mm0\n\t" | |
1019 "movq %2, %%mm1\n\t" | |
1020 "movq 1%1, %%mm4\n\t" | |
1021 "movq 1%2, %%mm5\n\t" | |
1022 "movq %%mm0, %%mm2\n\t" | |
1023 "movq %%mm1, %%mm3\n\t" | |
1024 "punpcklbw %%mm7, %%mm0\n\t" | |
1025 "punpcklbw %%mm7, %%mm1\n\t" | |
1026 "punpckhbw %%mm7, %%mm2\n\t" | |
1027 "punpckhbw %%mm7, %%mm3\n\t" | |
1028 "paddusw %%mm1, %%mm0\n\t" | |
1029 "paddusw %%mm3, %%mm2\n\t" | |
1030 "movq %%mm4, %%mm1\n\t" | |
1031 "movq %%mm5, %%mm3\n\t" | |
1032 "punpcklbw %%mm7, %%mm4\n\t" | |
1033 "punpcklbw %%mm7, %%mm5\n\t" | |
1034 "punpckhbw %%mm7, %%mm1\n\t" | |
1035 "punpckhbw %%mm7, %%mm3\n\t" | |
1036 "paddusw %%mm5, %%mm4\n\t" | |
1037 "paddusw %%mm3, %%mm1\n\t" | |
1038 "paddusw %%mm6, %%mm4\n\t" | |
1039 "paddusw %%mm6, %%mm1\n\t" | |
1040 "paddusw %%mm4, %%mm0\n\t" | |
1041 "paddusw %%mm1, %%mm2\n\t" | |
1042 "movq %0, %%mm1\n\t" | |
1043 "movq 8%0, %%mm3\n\t" | |
1044 "psrlw $2, %%mm0\n\t" | |
1045 "psrlw $2, %%mm2\n\t" | |
1046 "psubsw %%mm0, %%mm1\n\t" | |
1047 "psubsw %%mm2, %%mm3\n\t" | |
1048 "movq %%mm1, %0\n\t" | |
1049 "movq %%mm3, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
1050 :"+m"(*p) |
0 | 1051 :"m"(*pix), |
1052 "m"(*(pix+line_size)) | |
1053 :"memory"); | |
1054 pix += line_size; | |
1055 p += 8 ; | |
1056 } while(--h); | |
1057 } | |
1058 | |
296 | 1059 static void clear_blocks_mmx(DCTELEM *blocks) |
1060 { | |
1061 asm volatile( | |
1062 "pxor %%mm7, %%mm7 \n\t" | |
1063 "movl $-128*6, %%eax \n\t" | |
1064 "1: \n\t" | |
1065 "movq %%mm7, (%0, %%eax) \n\t" | |
1066 "movq %%mm7, 8(%0, %%eax) \n\t" | |
1067 "movq %%mm7, 16(%0, %%eax) \n\t" | |
1068 "movq %%mm7, 24(%0, %%eax) \n\t" | |
1069 "addl $32, %%eax \n\t" | |
1070 " js 1b \n\t" | |
1071 : : "r" (((int)blocks)+128*6) | |
1072 : "%eax" | |
1073 ); | |
1074 } | |
1075 | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
/*
 * No-op stub used by the (disabled) speed-testing section of
 * dsputil_init_mmx.  The empty, unprototyped parameter list is deliberate:
 * the stub is assigned to function pointers with differing signatures.
 */
static void just_return()
{
}
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1077 |
0 | 1078 void dsputil_init_mmx(void) |
1079 { | |
1080 mm_flags = mm_support(); | |
188 | 1081 #if 1 |
1082 printf("libavcodec: CPU flags:"); | |
0 | 1083 if (mm_flags & MM_MMX) |
1084 printf(" mmx"); | |
1085 if (mm_flags & MM_MMXEXT) | |
1086 printf(" mmxext"); | |
1087 if (mm_flags & MM_3DNOW) | |
1088 printf(" 3dnow"); | |
1089 if (mm_flags & MM_SSE) | |
1090 printf(" sse"); | |
1091 if (mm_flags & MM_SSE2) | |
1092 printf(" sse2"); | |
1093 printf("\n"); | |
1094 #endif | |
1095 | |
1096 if (mm_flags & MM_MMX) { | |
1097 get_pixels = get_pixels_mmx; | |
324 | 1098 diff_pixels = diff_pixels_mmx; |
0 | 1099 put_pixels_clamped = put_pixels_clamped_mmx; |
1100 add_pixels_clamped = add_pixels_clamped_mmx; | |
296 | 1101 clear_blocks= clear_blocks_mmx; |
1102 | |
294 | 1103 pix_abs16x16 = pix_abs16x16_mmx; |
1104 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
1105 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
0 | 1106 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; |
294 | 1107 pix_abs8x8 = pix_abs8x8_mmx; |
1108 pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
1109 pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
1110 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
0 | 1111 av_fdct = fdct_mmx; |
1112 | |
1113 put_pixels_tab[0] = put_pixels_mmx; | |
1114 put_pixels_tab[1] = put_pixels_x2_mmx; | |
1115 put_pixels_tab[2] = put_pixels_y2_mmx; | |
1116 put_pixels_tab[3] = put_pixels_xy2_mmx; | |
1117 | |
1118 put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
1119 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1120 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1121 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
1122 | |
1123 avg_pixels_tab[0] = avg_pixels_mmx; | |
1124 avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
1125 avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
1126 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1127 | |
1128 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
1129 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
1130 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
1131 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
386 | 1132 |
0 | 1133 sub_pixels_tab[0] = sub_pixels_mmx; |
1134 sub_pixels_tab[1] = sub_pixels_x2_mmx; | |
1135 sub_pixels_tab[2] = sub_pixels_y2_mmx; | |
1136 sub_pixels_tab[3] = sub_pixels_xy2_mmx; | |
1137 | |
1138 if (mm_flags & MM_MMXEXT) { | |
294 | 1139 pix_abs16x16 = pix_abs16x16_mmx2; |
1140 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | |
1141 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | |
1142 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; | |
1143 | |
1144 pix_abs8x8 = pix_abs8x8_mmx2; | |
1145 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | |
1146 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | |
1147 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | |
386 | 1148 |
1149 put_pixels_tab[1] = put_pixels_x2_mmx2; | |
1150 put_pixels_tab[2] = put_pixels_y2_mmx2; | |
1151 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2; | |
1152 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2; | |
0 | 1153 |
386 | 1154 avg_pixels_tab[0] = avg_pixels_mmx2; |
1155 avg_pixels_tab[1] = avg_pixels_x2_mmx2; | |
1156 avg_pixels_tab[2] = avg_pixels_y2_mmx2; | |
1157 avg_pixels_tab[3] = avg_pixels_xy2_mmx2; | |
0 | 1158 |
386 | 1159 sub_pixels_tab[1] = sub_pixels_x2_mmx2; |
1160 sub_pixels_tab[2] = sub_pixels_y2_mmx2; | |
0 | 1161 } else if (mm_flags & MM_3DNOW) { |
1162 put_pixels_tab[1] = put_pixels_x2_3dnow; | |
1163 put_pixels_tab[2] = put_pixels_y2_3dnow; | |
386 | 1164 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow; |
1165 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow; | |
0 | 1166 |
1167 avg_pixels_tab[0] = avg_pixels_3dnow; | |
1168 avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
1169 avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
1170 avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
1171 | |
1172 sub_pixels_tab[1] = sub_pixels_x2_3dnow; | |
1173 sub_pixels_tab[2] = sub_pixels_y2_3dnow; | |
1174 } | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1175 |
42 | 1176 /* idct */ |
1177 if (mm_flags & MM_MMXEXT) { | |
1178 ff_idct = ff_mmxext_idct; | |
1179 } else { | |
1180 ff_idct = ff_mmx_idct; | |
1181 } | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1182 #ifdef SIMPLE_IDCT |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1183 // ff_idct = simple_idct; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1184 ff_idct = simple_idct_mmx; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1185 #endif |
0 | 1186 } |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1187 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1188 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1189 // for speed testing |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1190 get_pixels = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1191 put_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1192 add_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1193 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1194 pix_abs16x16 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1195 pix_abs16x16_x2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1196 pix_abs16x16_y2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1197 pix_abs16x16_xy2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1198 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1199 put_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1200 put_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1201 put_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1202 put_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1203 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1204 put_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1205 put_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1206 put_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1207 put_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1208 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1209 avg_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1210 avg_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1211 avg_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1212 avg_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1213 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1214 avg_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1215 avg_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1216 avg_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1217 avg_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1218 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1219 sub_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1220 sub_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1221 sub_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1222 sub_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1223 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1224 //av_fdct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1225 //ff_idct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1226 #endif |
0 | 1227 } |