Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 354:167aa21aa250 libavcodec
patch by Alex Beregszaszi <alex@naxine.org>
- AVID (AVRn) support (workaround)
- print error instead of failing for unsupported SOF
- fixed the 0<code<FF range checking
author | arpi_esp |
---|---|
date | Fri, 03 May 2002 16:34:40 +0000 |
parents | 8635a7036395 |
children | f49629bab18d |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
3 * Copyright (c) 2000, 2001 Gerard Lantau. | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 * | |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 #include "../dsputil.h" | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
23 #include "../simple_idct.h" |
0 | 24 |
5 | 25 int mm_flags; /* multimedia extension flags */ |
26 | |
294 | 27 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); |
28 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
29 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
30 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
31 | |
32 int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
33 int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
34 int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
35 int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
36 | |
37 int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
38 int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
39 int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
40 int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
41 | |
42 int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
43 int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
44 int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
45 int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
46 | |
42 | 47 /* external functions, from idct_mmx.c */ |
48 void ff_mmx_idct(DCTELEM *block); | |
49 void ff_mmxext_idct(DCTELEM *block); | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
50 |
0 | 51 /* pixel operations */ |
294 | 52 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001LL; |
53 static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002LL; | |
8 | 54 //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; |
55 //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; | |
0 | 56 |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
/* Emit an 8-byte alignment directive at the point where the macro is used. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)

/* Clear the MMX register `regd` by XOR-ing it with itself. */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/*
 * MOVQ_WONE(regd) / MOVQ_WTWO(regd): fill every 16-bit lane of `regd`
 * with 1 or 2 respectively.
 */
#ifdef PIC
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1 (all bits set); psrlw $15 leaves 1 in each word
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd ::)

// same as above, then shifted left once so each word holds 2
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd " \n\t" \
        "psllw $1, %%" #regd ::)
#else
/* Non-PIC build: load the constants straight from mm_wone / mm_wtwo. */
#define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#endif
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
77 |
0 | 78 /***********************************/ |
79 /* 3Dnow specific */ | |
80 | |
81 #define DEF(x) x ## _3dnow | |
82 /* for Athlons PAVGUSB is preferred */ | |
83 #define PAVGB "pavgusb" | |
84 | |
85 #include "dsputil_mmx_avg.h" | |
86 | |
87 #undef DEF | |
88 #undef PAVGB | |
89 | |
90 /***********************************/ | |
91 /* MMX2 specific */ | |
92 | |
93 #define DEF(x) x ## _sse | |
94 | |
95 /* Introduced only in MMX2 set */ | |
96 #define PAVGB "pavgb" | |
97 | |
98 #include "dsputil_mmx_avg.h" | |
99 | |
100 #undef DEF | |
101 #undef PAVGB | |
102 | |
103 /***********************************/ | |
104 /* standard MMX */ | |
105 | |
106 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
107 { | |
108 DCTELEM *p; | |
109 const UINT8 *pix; | |
110 int i; | |
111 | |
112 /* read the pixels */ | |
113 p = block; | |
114 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
115 MOVQ_ZERO(mm7); |
0 | 116 for(i=0;i<4;i++) { |
117 __asm __volatile( | |
118 "movq %1, %%mm0\n\t" | |
119 "movq %2, %%mm1\n\t" | |
120 "movq %%mm0, %%mm2\n\t" | |
121 "movq %%mm1, %%mm3\n\t" | |
122 "punpcklbw %%mm7, %%mm0\n\t" | |
123 "punpckhbw %%mm7, %%mm2\n\t" | |
124 "punpcklbw %%mm7, %%mm1\n\t" | |
125 "punpckhbw %%mm7, %%mm3\n\t" | |
126 "movq %%mm0, %0\n\t" | |
127 "movq %%mm2, 8%0\n\t" | |
128 "movq %%mm1, 16%0\n\t" | |
129 "movq %%mm3, 24%0\n\t" | |
130 :"=m"(*p) | |
131 :"m"(*pix), "m"(*(pix+line_size)) | |
132 :"memory"); | |
133 pix += line_size*2; | |
134 p += 16; | |
135 } | |
136 } | |
137 | |
324 | 138 static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) |
139 { | |
140 asm volatile( | |
141 ".balign 16 \n\t" | |
142 "movl $-128, %%eax \n\t" | |
143 "1: \n\t" | |
144 "movq (%0), %%mm0 \n\t" | |
145 "movq (%1), %%mm2 \n\t" | |
146 "movq %%mm0, %%mm1 \n\t" | |
147 "movq %%mm2, %%mm3 \n\t" | |
148 "punpcklbw %%mm7, %%mm0 \n\t" | |
149 "punpckhbw %%mm7, %%mm1 \n\t" | |
150 "punpcklbw %%mm7, %%mm2 \n\t" | |
151 "punpckhbw %%mm7, %%mm3 \n\t" | |
152 "psubw %%mm2, %%mm0 \n\t" | |
153 "psubw %%mm3, %%mm1 \n\t" | |
154 "movq %%mm0, (%2, %%eax)\n\t" | |
155 "movq %%mm1, 8(%2, %%eax)\n\t" | |
156 "addl %3, %0 \n\t" | |
157 "addl %3, %1 \n\t" | |
158 "addl $16, %%eax \n\t" | |
159 "jnz 1b \n\t" | |
160 : "+r" (s1), "+r" (s2) | |
161 : "r" (block+64), "r" (stride) | |
162 : "%eax" | |
163 ); | |
164 } | |
165 | |
0 | 166 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) |
167 { | |
168 const DCTELEM *p; | |
169 UINT8 *pix; | |
170 | |
171 /* read the pixels */ | |
172 p = block; | |
173 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
174 /* unrolled loop */ |
0 | 175 __asm __volatile( |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
176 "movq %3, %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
177 "movq 8%3, %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
178 "movq 16%3, %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
179 "movq 24%3, %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
180 "movq 32%3, %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
181 "movq 40%3, %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
182 "movq 48%3, %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
183 "movq 56%3, %%mm7\n\t" |
0 | 184 "packuswb %%mm1, %%mm0\n\t" |
185 "packuswb %%mm3, %%mm2\n\t" | |
186 "packuswb %%mm5, %%mm4\n\t" | |
187 "packuswb %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
188 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
189 "movq %%mm2, (%0, %1)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
190 "movq %%mm4, (%0, %1, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
191 "movq %%mm6, (%0, %2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
192 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) |
0 | 193 :"memory"); |
194 pix += line_size*4; | |
195 p += 32; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
196 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
197 // if here would be an exact copy of the code above |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
198 // compiler would generate some very strange code |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
199 // thus using "r" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
200 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
201 "movq (%3), %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
202 "movq 8(%3), %%mm1\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
203 "movq 16(%3), %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
204 "movq 24(%3), %%mm3\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
205 "movq 32(%3), %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
206 "movq 40(%3), %%mm5\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
207 "movq 48(%3), %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
208 "movq 56(%3), %%mm7\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
209 "packuswb %%mm1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
210 "packuswb %%mm3, %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
211 "packuswb %%mm5, %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
212 "packuswb %%mm7, %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
213 "movq %%mm0, (%0)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
214 "movq %%mm2, (%0, %1)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
215 "movq %%mm4, (%0, %1, 2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
216 "movq %%mm6, (%0, %2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
217 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
218 :"memory"); |
0 | 219 } |
220 | |
221 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
222 { | |
223 const DCTELEM *p; | |
224 UINT8 *pix; | |
225 int i; | |
226 | |
227 /* read the pixels */ | |
228 p = block; | |
229 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
230 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
231 i = 4; |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
232 do { |
0 | 233 __asm __volatile( |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
234 "movq (%2), %%mm0\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
235 "movq 8(%2), %%mm1\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
236 "movq 16(%2), %%mm2\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
237 "movq 24(%2), %%mm3\n\t" |
0 | 238 "movq %0, %%mm4\n\t" |
239 "movq %1, %%mm6\n\t" | |
240 "movq %%mm4, %%mm5\n\t" | |
241 "punpcklbw %%mm7, %%mm4\n\t" | |
242 "punpckhbw %%mm7, %%mm5\n\t" | |
243 "paddsw %%mm4, %%mm0\n\t" | |
244 "paddsw %%mm5, %%mm1\n\t" | |
245 "movq %%mm6, %%mm5\n\t" | |
246 "punpcklbw %%mm7, %%mm6\n\t" | |
247 "punpckhbw %%mm7, %%mm5\n\t" | |
248 "paddsw %%mm6, %%mm2\n\t" | |
249 "paddsw %%mm5, %%mm3\n\t" | |
250 "packuswb %%mm1, %%mm0\n\t" | |
251 "packuswb %%mm3, %%mm2\n\t" | |
252 "movq %%mm0, %0\n\t" | |
253 "movq %%mm2, %1\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
254 :"+m"(*pix), "+m"(*(pix+line_size)) |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
255 :"r"(p) |
0 | 256 :"memory"); |
257 pix += line_size*2; | |
258 p += 16; | |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
259 } while (--i); |
0 | 260 } |
261 | |
262 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
263 { | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
264 int hh; |
0 | 265 UINT8 *p; |
266 const UINT8 *pix; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
267 |
0 | 268 p = block; |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
269 pix = pixels; // 2s |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
270 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
271 do { |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
272 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
273 "movq %1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
274 "movq %%mm0, %0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
275 :"=m"(*p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
276 :"m"(*pix) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
277 :"memory"); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
278 pix += line_size; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
279 p += line_size; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
280 } while (--h); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
281 #else |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
282 // this optimized code is not very usefull |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
283 // the above loop is definitely faster |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
284 // at least on Celeron 500MHz |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
285 hh = h & 3; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
286 while (hh) { |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
287 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
288 "movq %1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
289 "movq %%mm0, %0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
290 :"=m"(*p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
291 :"m"(*pix) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
292 :"memory"); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
293 pix += line_size; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
294 p += line_size; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
295 hh--; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
296 } |
0 | 297 hh=h>>2; |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
298 while (hh) { |
0 | 299 __asm __volatile( |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
300 "movq (%1), %%mm0 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
301 "movq (%1, %2), %%mm1 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
302 "movq (%1, %2, 2), %%mm2 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
303 "movq (%1, %3), %%mm3 \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
304 "movq %%mm0, (%0) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
305 "movq %%mm1, (%0, %2) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
306 "movq %%mm2, (%0, %2, 2) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
307 "movq %%mm3, (%0, %3) \n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
308 ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3) |
0 | 309 :"memory"); |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
310 pix += line_size*4; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
311 p += line_size*4; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
312 hh--; |
0 | 313 } |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
314 #endif |
0 | 315 } |
316 | |
317 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
318 { | |
319 UINT8 *p; | |
320 const UINT8 *pix; | |
321 p = block; | |
322 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
323 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
324 MOVQ_WONE(mm4); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
325 JUMPALIGN(); |
0 | 326 do { |
327 __asm __volatile( | |
328 "movq %1, %%mm0\n\t" | |
329 "movq 1%1, %%mm1\n\t" | |
330 "movq %%mm0, %%mm2\n\t" | |
331 "movq %%mm1, %%mm3\n\t" | |
332 "punpcklbw %%mm7, %%mm0\n\t" | |
333 "punpcklbw %%mm7, %%mm1\n\t" | |
334 "punpckhbw %%mm7, %%mm2\n\t" | |
335 "punpckhbw %%mm7, %%mm3\n\t" | |
336 "paddusw %%mm1, %%mm0\n\t" | |
337 "paddusw %%mm3, %%mm2\n\t" | |
338 "paddusw %%mm4, %%mm0\n\t" | |
339 "paddusw %%mm4, %%mm2\n\t" | |
340 "psrlw $1, %%mm0\n\t" | |
341 "psrlw $1, %%mm2\n\t" | |
342 "packuswb %%mm2, %%mm0\n\t" | |
343 "movq %%mm0, %0\n\t" | |
344 :"=m"(*p) | |
345 :"m"(*pix) | |
346 :"memory"); | |
347 pix += line_size; p += line_size; | |
348 } while (--h); | |
349 } | |
350 | |
351 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
352 { | |
353 UINT8 *p; | |
354 const UINT8 *pix; | |
355 p = block; | |
356 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
357 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
358 MOVQ_WONE(mm4); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
359 JUMPALIGN(); |
0 | 360 do { |
361 __asm __volatile( | |
362 "movq %1, %%mm0\n\t" | |
363 "movq %2, %%mm1\n\t" | |
364 "movq %%mm0, %%mm2\n\t" | |
365 "movq %%mm1, %%mm3\n\t" | |
366 "punpcklbw %%mm7, %%mm0\n\t" | |
367 "punpcklbw %%mm7, %%mm1\n\t" | |
368 "punpckhbw %%mm7, %%mm2\n\t" | |
369 "punpckhbw %%mm7, %%mm3\n\t" | |
370 "paddusw %%mm1, %%mm0\n\t" | |
371 "paddusw %%mm3, %%mm2\n\t" | |
372 "paddusw %%mm4, %%mm0\n\t" | |
373 "paddusw %%mm4, %%mm2\n\t" | |
374 "psrlw $1, %%mm0\n\t" | |
375 "psrlw $1, %%mm2\n\t" | |
376 "packuswb %%mm2, %%mm0\n\t" | |
377 "movq %%mm0, %0\n\t" | |
378 :"=m"(*p) | |
379 :"m"(*pix), | |
380 "m"(*(pix+line_size)) | |
381 :"memory"); | |
382 pix += line_size; | |
383 p += line_size; | |
384 } while (--h); | |
385 } | |
386 | |
387 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
388 { | |
389 UINT8 *p; | |
390 const UINT8 *pix; | |
391 p = block; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
392 pix = pixels; // 1s |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
393 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
394 MOVQ_WTWO(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
395 JUMPALIGN(); |
0 | 396 do { |
397 __asm __volatile( | |
398 "movq %1, %%mm0\n\t" | |
399 "movq %2, %%mm1\n\t" | |
400 "movq 1%1, %%mm4\n\t" | |
401 "movq 1%2, %%mm5\n\t" | |
402 "movq %%mm0, %%mm2\n\t" | |
403 "movq %%mm1, %%mm3\n\t" | |
404 "punpcklbw %%mm7, %%mm0\n\t" | |
405 "punpcklbw %%mm7, %%mm1\n\t" | |
406 "punpckhbw %%mm7, %%mm2\n\t" | |
407 "punpckhbw %%mm7, %%mm3\n\t" | |
408 "paddusw %%mm1, %%mm0\n\t" | |
409 "paddusw %%mm3, %%mm2\n\t" | |
410 "movq %%mm4, %%mm1\n\t" | |
411 "movq %%mm5, %%mm3\n\t" | |
412 "punpcklbw %%mm7, %%mm4\n\t" | |
413 "punpcklbw %%mm7, %%mm5\n\t" | |
414 "punpckhbw %%mm7, %%mm1\n\t" | |
415 "punpckhbw %%mm7, %%mm3\n\t" | |
416 "paddusw %%mm5, %%mm4\n\t" | |
417 "paddusw %%mm3, %%mm1\n\t" | |
418 "paddusw %%mm6, %%mm4\n\t" | |
419 "paddusw %%mm6, %%mm1\n\t" | |
420 "paddusw %%mm4, %%mm0\n\t" | |
421 "paddusw %%mm1, %%mm2\n\t" | |
422 "psrlw $2, %%mm0\n\t" | |
423 "psrlw $2, %%mm2\n\t" | |
424 "packuswb %%mm2, %%mm0\n\t" | |
425 "movq %%mm0, %0\n\t" | |
426 :"=m"(*p) | |
427 :"m"(*pix), | |
428 "m"(*(pix+line_size)) | |
429 :"memory"); | |
430 pix += line_size; | |
431 p += line_size; | |
432 } while(--h); | |
433 } | |
434 | |
435 static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
436 { | |
437 UINT8 *p; | |
438 const UINT8 *pix; | |
439 p = block; | |
440 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
441 MOVQ_ZERO(mm7); |
0 | 442 do { |
443 __asm __volatile( | |
444 "movq %1, %%mm0\n\t" | |
445 "movq 1%1, %%mm1\n\t" | |
446 "movq %%mm0, %%mm2\n\t" | |
447 "movq %%mm1, %%mm3\n\t" | |
448 "punpcklbw %%mm7, %%mm0\n\t" | |
449 "punpcklbw %%mm7, %%mm1\n\t" | |
450 "punpckhbw %%mm7, %%mm2\n\t" | |
451 "punpckhbw %%mm7, %%mm3\n\t" | |
452 "paddusw %%mm1, %%mm0\n\t" | |
453 "paddusw %%mm3, %%mm2\n\t" | |
454 "psrlw $1, %%mm0\n\t" | |
455 "psrlw $1, %%mm2\n\t" | |
456 "packuswb %%mm2, %%mm0\n\t" | |
457 "movq %%mm0, %0\n\t" | |
458 :"=m"(*p) | |
459 :"m"(*pix) | |
460 :"memory"); | |
461 pix += line_size; | |
462 p += line_size; | |
463 } while (--h); | |
464 } | |
465 | |
466 static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
467 { | |
468 UINT8 *p; | |
469 const UINT8 *pix; | |
470 p = block; | |
471 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
472 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
473 JUMPALIGN(); |
0 | 474 do { |
475 __asm __volatile( | |
476 "movq %1, %%mm0\n\t" | |
477 "movq %2, %%mm1\n\t" | |
478 "movq %%mm0, %%mm2\n\t" | |
479 "movq %%mm1, %%mm3\n\t" | |
480 "punpcklbw %%mm7, %%mm0\n\t" | |
481 "punpcklbw %%mm7, %%mm1\n\t" | |
482 "punpckhbw %%mm7, %%mm2\n\t" | |
483 "punpckhbw %%mm7, %%mm3\n\t" | |
484 "paddusw %%mm1, %%mm0\n\t" | |
485 "paddusw %%mm3, %%mm2\n\t" | |
486 "psrlw $1, %%mm0\n\t" | |
487 "psrlw $1, %%mm2\n\t" | |
488 "packuswb %%mm2, %%mm0\n\t" | |
489 "movq %%mm0, %0\n\t" | |
490 :"=m"(*p) | |
491 :"m"(*pix), | |
492 "m"(*(pix+line_size)) | |
493 :"memory"); | |
494 pix += line_size; | |
495 p += line_size; | |
496 } while(--h); | |
497 } | |
498 | |
499 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
500 { | |
501 UINT8 *p; | |
502 const UINT8 *pix; | |
503 p = block; | |
504 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
505 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
506 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
507 JUMPALIGN(); |
0 | 508 do { |
509 __asm __volatile( | |
510 "movq %1, %%mm0\n\t" | |
511 "movq %2, %%mm1\n\t" | |
512 "movq 1%1, %%mm4\n\t" | |
513 "movq 1%2, %%mm5\n\t" | |
514 "movq %%mm0, %%mm2\n\t" | |
515 "movq %%mm1, %%mm3\n\t" | |
516 "punpcklbw %%mm7, %%mm0\n\t" | |
517 "punpcklbw %%mm7, %%mm1\n\t" | |
518 "punpckhbw %%mm7, %%mm2\n\t" | |
519 "punpckhbw %%mm7, %%mm3\n\t" | |
520 "paddusw %%mm1, %%mm0\n\t" | |
521 "paddusw %%mm3, %%mm2\n\t" | |
522 "movq %%mm4, %%mm1\n\t" | |
523 "movq %%mm5, %%mm3\n\t" | |
524 "punpcklbw %%mm7, %%mm4\n\t" | |
525 "punpcklbw %%mm7, %%mm5\n\t" | |
526 "punpckhbw %%mm7, %%mm1\n\t" | |
527 "punpckhbw %%mm7, %%mm3\n\t" | |
528 "paddusw %%mm5, %%mm4\n\t" | |
529 "paddusw %%mm3, %%mm1\n\t" | |
530 "paddusw %%mm6, %%mm4\n\t" | |
531 "paddusw %%mm6, %%mm1\n\t" | |
532 "paddusw %%mm4, %%mm0\n\t" | |
533 "paddusw %%mm1, %%mm2\n\t" | |
534 "psrlw $2, %%mm0\n\t" | |
535 "psrlw $2, %%mm2\n\t" | |
536 "packuswb %%mm2, %%mm0\n\t" | |
537 "movq %%mm0, %0\n\t" | |
538 :"=m"(*p) | |
539 :"m"(*pix), | |
540 "m"(*(pix+line_size)) | |
541 :"memory"); | |
542 pix += line_size; | |
543 p += line_size; | |
544 } while(--h); | |
545 } | |
546 | |
547 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
548 { | |
549 UINT8 *p; | |
550 const UINT8 *pix; | |
551 p = block; | |
552 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
553 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
554 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
555 JUMPALIGN(); |
0 | 556 do { |
557 __asm __volatile( | |
558 "movq %0, %%mm0\n\t" | |
559 "movq %1, %%mm1\n\t" | |
560 "movq %%mm0, %%mm2\n\t" | |
561 "movq %%mm1, %%mm3\n\t" | |
562 "punpcklbw %%mm7, %%mm0\n\t" | |
563 "punpcklbw %%mm7, %%mm1\n\t" | |
564 "punpckhbw %%mm7, %%mm2\n\t" | |
565 "punpckhbw %%mm7, %%mm3\n\t" | |
566 "paddusw %%mm1, %%mm0\n\t" | |
567 "paddusw %%mm3, %%mm2\n\t" | |
568 "paddusw %%mm6, %%mm0\n\t" | |
569 "paddusw %%mm6, %%mm2\n\t" | |
570 "psrlw $1, %%mm0\n\t" | |
571 "psrlw $1, %%mm2\n\t" | |
572 "packuswb %%mm2, %%mm0\n\t" | |
573 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
574 :"+m"(*p) |
0 | 575 :"m"(*pix) |
576 :"memory"); | |
577 pix += line_size; | |
578 p += line_size; | |
579 } | |
580 while (--h); | |
581 } | |
582 | |
583 static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
584 { | |
585 UINT8 *p; | |
586 const UINT8 *pix; | |
587 p = block; | |
588 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
589 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
590 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
591 JUMPALIGN(); |
0 | 592 do { |
593 __asm __volatile( | |
594 "movq %1, %%mm1\n\t" | |
595 "movq %0, %%mm0\n\t" | |
596 "movq 1%1, %%mm4\n\t" | |
597 "movq %%mm0, %%mm2\n\t" | |
598 "movq %%mm1, %%mm3\n\t" | |
599 "movq %%mm4, %%mm5\n\t" | |
600 "punpcklbw %%mm7, %%mm1\n\t" | |
601 "punpckhbw %%mm7, %%mm3\n\t" | |
602 "punpcklbw %%mm7, %%mm4\n\t" | |
603 "punpckhbw %%mm7, %%mm5\n\t" | |
604 "punpcklbw %%mm7, %%mm0\n\t" | |
605 "punpckhbw %%mm7, %%mm2\n\t" | |
606 "paddusw %%mm4, %%mm1\n\t" | |
607 "paddusw %%mm5, %%mm3\n\t" | |
608 "paddusw %%mm6, %%mm1\n\t" | |
609 "paddusw %%mm6, %%mm3\n\t" | |
610 "psrlw $1, %%mm1\n\t" | |
611 "psrlw $1, %%mm3\n\t" | |
612 "paddusw %%mm6, %%mm0\n\t" | |
613 "paddusw %%mm6, %%mm2\n\t" | |
614 "paddusw %%mm1, %%mm0\n\t" | |
615 "paddusw %%mm3, %%mm2\n\t" | |
616 "psrlw $1, %%mm0\n\t" | |
617 "psrlw $1, %%mm2\n\t" | |
618 "packuswb %%mm2, %%mm0\n\t" | |
619 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
620 :"+m"(*p) |
0 | 621 :"m"(*pix) |
622 :"memory"); | |
623 pix += line_size; | |
624 p += line_size; | |
625 } while (--h); | |
626 } | |
627 | |
628 static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
629 { | |
630 UINT8 *p; | |
631 const UINT8 *pix; | |
632 p = block; | |
633 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
634 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
635 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
636 JUMPALIGN(); |
0 | 637 do { |
638 __asm __volatile( | |
639 "movq %1, %%mm1\n\t" | |
640 "movq %0, %%mm0\n\t" | |
641 "movq %2, %%mm4\n\t" | |
642 "movq %%mm0, %%mm2\n\t" | |
643 "movq %%mm1, %%mm3\n\t" | |
644 "movq %%mm4, %%mm5\n\t" | |
645 "punpcklbw %%mm7, %%mm1\n\t" | |
646 "punpckhbw %%mm7, %%mm3\n\t" | |
647 "punpcklbw %%mm7, %%mm4\n\t" | |
648 "punpckhbw %%mm7, %%mm5\n\t" | |
649 "punpcklbw %%mm7, %%mm0\n\t" | |
650 "punpckhbw %%mm7, %%mm2\n\t" | |
651 "paddusw %%mm4, %%mm1\n\t" | |
652 "paddusw %%mm5, %%mm3\n\t" | |
653 "paddusw %%mm6, %%mm1\n\t" | |
654 "paddusw %%mm6, %%mm3\n\t" | |
655 "psrlw $1, %%mm1\n\t" | |
656 "psrlw $1, %%mm3\n\t" | |
657 "paddusw %%mm6, %%mm0\n\t" | |
658 "paddusw %%mm6, %%mm2\n\t" | |
659 "paddusw %%mm1, %%mm0\n\t" | |
660 "paddusw %%mm3, %%mm2\n\t" | |
661 "psrlw $1, %%mm0\n\t" | |
662 "psrlw $1, %%mm2\n\t" | |
663 "packuswb %%mm2, %%mm0\n\t" | |
664 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
665 :"+m"(*p) |
0 | 666 :"m"(*pix), "m"(*(pix+line_size)) |
667 :"memory"); | |
668 pix += line_size; | |
669 p += line_size ; | |
670 } while(--h); | |
671 } | |
672 | |
673 static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
674 { | |
675 UINT8 *p; | |
676 const UINT8 *pix; | |
677 p = block; | |
678 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
679 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
680 // this doesn't seem to be used offten - so |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
681 // the inside usage of mm_wone is not optimized |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
682 MOVQ_WTWO(mm6); |
0 | 683 do { |
684 __asm __volatile( | |
685 "movq %1, %%mm0\n\t" | |
686 "movq %2, %%mm1\n\t" | |
687 "movq 1%1, %%mm4\n\t" | |
688 "movq 1%2, %%mm5\n\t" | |
689 "movq %%mm0, %%mm2\n\t" | |
690 "movq %%mm1, %%mm3\n\t" | |
691 "punpcklbw %%mm7, %%mm0\n\t" | |
692 "punpcklbw %%mm7, %%mm1\n\t" | |
693 "punpckhbw %%mm7, %%mm2\n\t" | |
694 "punpckhbw %%mm7, %%mm3\n\t" | |
695 "paddusw %%mm1, %%mm0\n\t" | |
696 "paddusw %%mm3, %%mm2\n\t" | |
697 "movq %%mm4, %%mm1\n\t" | |
698 "movq %%mm5, %%mm3\n\t" | |
699 "punpcklbw %%mm7, %%mm4\n\t" | |
700 "punpcklbw %%mm7, %%mm5\n\t" | |
701 "punpckhbw %%mm7, %%mm1\n\t" | |
702 "punpckhbw %%mm7, %%mm3\n\t" | |
703 "paddusw %%mm5, %%mm4\n\t" | |
704 "paddusw %%mm3, %%mm1\n\t" | |
705 "paddusw %%mm6, %%mm4\n\t" | |
706 "paddusw %%mm6, %%mm1\n\t" | |
707 "paddusw %%mm4, %%mm0\n\t" | |
708 "paddusw %%mm1, %%mm2\n\t" | |
709 "movq %3, %%mm5\n\t" | |
710 "psrlw $2, %%mm0\n\t" | |
711 "movq %0, %%mm1\n\t" | |
712 "psrlw $2, %%mm2\n\t" | |
713 "movq %%mm1, %%mm3\n\t" | |
714 "punpcklbw %%mm7, %%mm1\n\t" | |
715 "punpckhbw %%mm7, %%mm3\n\t" | |
716 "paddusw %%mm1, %%mm0\n\t" | |
717 "paddusw %%mm3, %%mm2\n\t" | |
718 "paddusw %%mm5, %%mm0\n\t" | |
719 "paddusw %%mm5, %%mm2\n\t" | |
720 "psrlw $1, %%mm0\n\t" | |
721 "psrlw $1, %%mm2\n\t" | |
722 "packuswb %%mm2, %%mm0\n\t" | |
723 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
724 :"+m"(*p) |
0 | 725 :"m"(*pix), |
8 | 726 "m"(*(pix+line_size)), "m"(mm_wone) |
0 | 727 :"memory"); |
728 pix += line_size; | |
729 p += line_size ; | |
730 } while(--h); | |
731 } | |
732 | |
733 static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
734 { | |
735 UINT8 *p; | |
736 const UINT8 *pix; | |
737 p = block; | |
738 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
739 MOVQ_ZERO(mm7); |
0 | 740 do { |
741 __asm __volatile( | |
742 "movq %1, %%mm0\n\t" | |
743 "movq %0, %%mm1\n\t" | |
744 "movq %%mm0, %%mm2\n\t" | |
745 "movq %%mm1, %%mm3\n\t" | |
746 "punpcklbw %%mm7, %%mm0\n\t" | |
747 "punpcklbw %%mm7, %%mm1\n\t" | |
748 "punpckhbw %%mm7, %%mm2\n\t" | |
749 "punpckhbw %%mm7, %%mm3\n\t" | |
750 "paddusw %%mm1, %%mm0\n\t" | |
751 "paddusw %%mm3, %%mm2\n\t" | |
752 "psrlw $1, %%mm0\n\t" | |
753 "psrlw $1, %%mm2\n\t" | |
754 "packuswb %%mm2, %%mm0\n\t" | |
755 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
756 :"+m"(*p) |
0 | 757 :"m"(*pix) |
758 :"memory"); | |
759 pix += line_size; | |
760 p += line_size ; | |
761 } while (--h); | |
762 } | |
763 | |
764 static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
765 { | |
766 UINT8 *p; | |
767 const UINT8 *pix; | |
768 p = block; | |
769 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
770 MOVQ_ZERO(mm7); |
0 | 771 do { |
772 __asm __volatile( | |
773 "movq %1, %%mm0\n\t" | |
774 "movq 1%1, %%mm1\n\t" | |
775 "movq %0, %%mm4\n\t" | |
776 "movq %%mm0, %%mm2\n\t" | |
777 "movq %%mm1, %%mm3\n\t" | |
778 "movq %%mm4, %%mm5\n\t" | |
779 "punpcklbw %%mm7, %%mm0\n\t" | |
780 "punpcklbw %%mm7, %%mm1\n\t" | |
781 "punpckhbw %%mm7, %%mm2\n\t" | |
782 "punpckhbw %%mm7, %%mm3\n\t" | |
783 "punpcklbw %%mm7, %%mm4\n\t" | |
784 "punpckhbw %%mm7, %%mm5\n\t" | |
785 "paddusw %%mm1, %%mm0\n\t" | |
786 "paddusw %%mm3, %%mm2\n\t" | |
787 "psrlw $1, %%mm0\n\t" | |
788 "psrlw $1, %%mm2\n\t" | |
789 "paddusw %%mm4, %%mm0\n\t" | |
790 "paddusw %%mm5, %%mm2\n\t" | |
791 "psrlw $1, %%mm0\n\t" | |
792 "psrlw $1, %%mm2\n\t" | |
793 "packuswb %%mm2, %%mm0\n\t" | |
794 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
795 :"+m"(*p) |
0 | 796 :"m"(*pix) |
797 :"memory"); | |
798 pix += line_size; | |
799 p += line_size; | |
800 } while (--h); | |
801 } | |
802 | |
803 static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
804 { | |
805 UINT8 *p; | |
806 const UINT8 *pix; | |
807 p = block; | |
808 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
809 MOVQ_ZERO(mm7); |
0 | 810 do { |
811 __asm __volatile( | |
812 "movq %1, %%mm0\n\t" | |
813 "movq %2, %%mm1\n\t" | |
814 "movq %0, %%mm4\n\t" | |
815 "movq %%mm0, %%mm2\n\t" | |
816 "movq %%mm1, %%mm3\n\t" | |
817 "movq %%mm4, %%mm5\n\t" | |
818 "punpcklbw %%mm7, %%mm0\n\t" | |
819 "punpcklbw %%mm7, %%mm1\n\t" | |
820 "punpckhbw %%mm7, %%mm2\n\t" | |
821 "punpckhbw %%mm7, %%mm3\n\t" | |
822 "punpcklbw %%mm7, %%mm4\n\t" | |
823 "punpckhbw %%mm7, %%mm5\n\t" | |
824 "paddusw %%mm1, %%mm0\n\t" | |
825 "paddusw %%mm3, %%mm2\n\t" | |
826 "psrlw $1, %%mm0\n\t" | |
827 "psrlw $1, %%mm2\n\t" | |
828 "paddusw %%mm4, %%mm0\n\t" | |
829 "paddusw %%mm5, %%mm2\n\t" | |
830 "psrlw $1, %%mm0\n\t" | |
831 "psrlw $1, %%mm2\n\t" | |
832 "packuswb %%mm2, %%mm0\n\t" | |
833 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
834 :"+m"(*p) |
0 | 835 :"m"(*pix), "m"(*(pix+line_size)) |
836 :"memory"); | |
837 pix += line_size; | |
838 p += line_size ; | |
839 } while(--h); | |
840 } | |
841 | |
842 static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
843 { | |
844 UINT8 *p; | |
845 const UINT8 *pix; | |
846 p = block; | |
847 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
848 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
849 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
850 JUMPALIGN(); |
0 | 851 do { |
852 __asm __volatile( | |
853 "movq %1, %%mm0\n\t" | |
854 "movq %2, %%mm1\n\t" | |
855 "movq 1%1, %%mm4\n\t" | |
856 "movq 1%2, %%mm5\n\t" | |
857 "movq %%mm0, %%mm2\n\t" | |
858 "movq %%mm1, %%mm3\n\t" | |
859 "punpcklbw %%mm7, %%mm0\n\t" | |
860 "punpcklbw %%mm7, %%mm1\n\t" | |
861 "punpckhbw %%mm7, %%mm2\n\t" | |
862 "punpckhbw %%mm7, %%mm3\n\t" | |
863 "paddusw %%mm1, %%mm0\n\t" | |
864 "paddusw %%mm3, %%mm2\n\t" | |
865 "movq %%mm4, %%mm1\n\t" | |
866 "movq %%mm5, %%mm3\n\t" | |
867 "punpcklbw %%mm7, %%mm4\n\t" | |
868 "punpcklbw %%mm7, %%mm5\n\t" | |
869 "punpckhbw %%mm7, %%mm1\n\t" | |
870 "punpckhbw %%mm7, %%mm3\n\t" | |
871 "paddusw %%mm5, %%mm4\n\t" | |
872 "paddusw %%mm3, %%mm1\n\t" | |
873 "paddusw %%mm6, %%mm4\n\t" | |
874 "paddusw %%mm6, %%mm1\n\t" | |
875 "paddusw %%mm4, %%mm0\n\t" | |
876 "paddusw %%mm1, %%mm2\n\t" | |
877 "movq %0, %%mm1\n\t" | |
878 "psrlw $2, %%mm0\n\t" | |
879 "movq %%mm1, %%mm3\n\t" | |
880 "psrlw $2, %%mm2\n\t" | |
881 "punpcklbw %%mm7, %%mm1\n\t" | |
882 "punpckhbw %%mm7, %%mm3\n\t" | |
883 "paddusw %%mm1, %%mm0\n\t" | |
884 "paddusw %%mm3, %%mm2\n\t" | |
885 "psrlw $1, %%mm0\n\t" | |
886 "psrlw $1, %%mm2\n\t" | |
887 "packuswb %%mm2, %%mm0\n\t" | |
888 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
889 :"+m"(*p) |
0 | 890 :"m"(*pix), |
891 "m"(*(pix+line_size)) | |
892 :"memory"); | |
893 pix += line_size; | |
894 p += line_size; | |
895 } while(--h); | |
896 } | |
897 | |
898 static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
899 { | |
900 DCTELEM *p; | |
901 const UINT8 *pix; | |
902 p = block; | |
903 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
904 MOVQ_ZERO(mm7); |
0 | 905 do { |
906 __asm __volatile( | |
907 "movq %0, %%mm0\n\t" | |
908 "movq %1, %%mm2\n\t" | |
909 "movq 8%0, %%mm1\n\t" | |
910 "movq %%mm2, %%mm3\n\t" | |
911 "punpcklbw %%mm7, %%mm2\n\t" | |
912 "punpckhbw %%mm7, %%mm3\n\t" | |
913 "psubsw %%mm2, %%mm0\n\t" | |
914 "psubsw %%mm3, %%mm1\n\t" | |
915 "movq %%mm0, %0\n\t" | |
916 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
917 :"+m"(*p) |
0 | 918 :"m"(*pix) |
919 :"memory"); | |
920 pix += line_size; | |
921 p += 8; | |
922 } while (--h); | |
923 } | |
924 | |
925 static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
926 { | |
927 DCTELEM *p; | |
928 const UINT8 *pix; | |
929 p = block; | |
930 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
931 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
932 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
933 JUMPALIGN(); |
0 | 934 do { |
935 __asm __volatile( | |
936 "movq %0, %%mm0\n\t" | |
937 "movq %1, %%mm2\n\t" | |
938 "movq 8%0, %%mm1\n\t" | |
939 "movq 1%1, %%mm4\n\t" | |
940 "movq %%mm2, %%mm3\n\t" | |
941 "movq %%mm4, %%mm5\n\t" | |
942 "punpcklbw %%mm7, %%mm2\n\t" | |
943 "punpckhbw %%mm7, %%mm3\n\t" | |
944 "punpcklbw %%mm7, %%mm4\n\t" | |
945 "punpckhbw %%mm7, %%mm5\n\t" | |
946 "paddusw %%mm4, %%mm2\n\t" | |
947 "paddusw %%mm5, %%mm3\n\t" | |
948 "paddusw %%mm6, %%mm2\n\t" | |
949 "paddusw %%mm6, %%mm3\n\t" | |
950 "psrlw $1, %%mm2\n\t" | |
951 "psrlw $1, %%mm3\n\t" | |
952 "psubsw %%mm2, %%mm0\n\t" | |
953 "psubsw %%mm3, %%mm1\n\t" | |
954 "movq %%mm0, %0\n\t" | |
955 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
956 :"+m"(*p) |
0 | 957 :"m"(*pix) |
958 :"memory"); | |
959 pix += line_size; | |
960 p += 8; | |
961 } while (--h); | |
962 } | |
963 | |
964 static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
965 { | |
966 DCTELEM *p; | |
967 const UINT8 *pix; | |
968 p = block; | |
969 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
970 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
971 MOVQ_WONE(mm6); |
0 | 972 do { |
973 __asm __volatile( | |
974 "movq %0, %%mm0\n\t" | |
975 "movq %1, %%mm2\n\t" | |
976 "movq 8%0, %%mm1\n\t" | |
977 "movq %2, %%mm4\n\t" | |
978 "movq %%mm2, %%mm3\n\t" | |
979 "movq %%mm4, %%mm5\n\t" | |
980 "punpcklbw %%mm7, %%mm2\n\t" | |
981 "punpckhbw %%mm7, %%mm3\n\t" | |
982 "punpcklbw %%mm7, %%mm4\n\t" | |
983 "punpckhbw %%mm7, %%mm5\n\t" | |
984 "paddusw %%mm4, %%mm2\n\t" | |
985 "paddusw %%mm5, %%mm3\n\t" | |
986 "paddusw %%mm6, %%mm2\n\t" | |
987 "paddusw %%mm6, %%mm3\n\t" | |
988 "psrlw $1, %%mm2\n\t" | |
989 "psrlw $1, %%mm3\n\t" | |
990 "psubsw %%mm2, %%mm0\n\t" | |
991 "psubsw %%mm3, %%mm1\n\t" | |
992 "movq %%mm0, %0\n\t" | |
993 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
994 :"+m"(*p) |
0 | 995 :"m"(*pix), "m"(*(pix+line_size)) |
996 :"memory"); | |
997 pix += line_size; | |
998 p += 8; | |
999 } while (--h); | |
1000 } | |
1001 | |
1002 static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
1003 { | |
1004 DCTELEM *p; | |
1005 const UINT8 *pix; | |
1006 p = block; | |
1007 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1008 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1009 MOVQ_WTWO(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1010 JUMPALIGN(); |
0 | 1011 do { |
1012 __asm __volatile( | |
1013 "movq %1, %%mm0\n\t" | |
1014 "movq %2, %%mm1\n\t" | |
1015 "movq 1%1, %%mm4\n\t" | |
1016 "movq 1%2, %%mm5\n\t" | |
1017 "movq %%mm0, %%mm2\n\t" | |
1018 "movq %%mm1, %%mm3\n\t" | |
1019 "punpcklbw %%mm7, %%mm0\n\t" | |
1020 "punpcklbw %%mm7, %%mm1\n\t" | |
1021 "punpckhbw %%mm7, %%mm2\n\t" | |
1022 "punpckhbw %%mm7, %%mm3\n\t" | |
1023 "paddusw %%mm1, %%mm0\n\t" | |
1024 "paddusw %%mm3, %%mm2\n\t" | |
1025 "movq %%mm4, %%mm1\n\t" | |
1026 "movq %%mm5, %%mm3\n\t" | |
1027 "punpcklbw %%mm7, %%mm4\n\t" | |
1028 "punpcklbw %%mm7, %%mm5\n\t" | |
1029 "punpckhbw %%mm7, %%mm1\n\t" | |
1030 "punpckhbw %%mm7, %%mm3\n\t" | |
1031 "paddusw %%mm5, %%mm4\n\t" | |
1032 "paddusw %%mm3, %%mm1\n\t" | |
1033 "paddusw %%mm6, %%mm4\n\t" | |
1034 "paddusw %%mm6, %%mm1\n\t" | |
1035 "paddusw %%mm4, %%mm0\n\t" | |
1036 "paddusw %%mm1, %%mm2\n\t" | |
1037 "movq %0, %%mm1\n\t" | |
1038 "movq 8%0, %%mm3\n\t" | |
1039 "psrlw $2, %%mm0\n\t" | |
1040 "psrlw $2, %%mm2\n\t" | |
1041 "psubsw %%mm0, %%mm1\n\t" | |
1042 "psubsw %%mm2, %%mm3\n\t" | |
1043 "movq %%mm1, %0\n\t" | |
1044 "movq %%mm3, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
1045 :"+m"(*p) |
0 | 1046 :"m"(*pix), |
1047 "m"(*(pix+line_size)) | |
1048 :"memory"); | |
1049 pix += line_size; | |
1050 p += 8 ; | |
1051 } while(--h); | |
1052 } | |
1053 | |
296 | 1054 static void clear_blocks_mmx(DCTELEM *blocks) |
1055 { | |
1056 asm volatile( | |
1057 "pxor %%mm7, %%mm7 \n\t" | |
1058 "movl $-128*6, %%eax \n\t" | |
1059 "1: \n\t" | |
1060 "movq %%mm7, (%0, %%eax) \n\t" | |
1061 "movq %%mm7, 8(%0, %%eax) \n\t" | |
1062 "movq %%mm7, 16(%0, %%eax) \n\t" | |
1063 "movq %%mm7, 24(%0, %%eax) \n\t" | |
1064 "addl $32, %%eax \n\t" | |
1065 " js 1b \n\t" | |
1066 : : "r" (((int)blocks)+128*6) | |
1067 : "%eax" | |
1068 ); | |
1069 } | |
1070 | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
/*
 * Does nothing.  Stand-in assigned to the DSP function pointers by the
 * disabled "speed testing" section of dsputil_init_mmx().  The empty
 * (unprototyped) parameter list is kept as-is; presumably it is what lets the
 * stub be assigned to pointers with differing argument lists.
 */
static void just_return()
{
}
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1072 |
0 | 1073 void dsputil_init_mmx(void) |
1074 { | |
1075 mm_flags = mm_support(); | |
188 | 1076 #if 1 |
1077 printf("libavcodec: CPU flags:"); | |
0 | 1078 if (mm_flags & MM_MMX) |
1079 printf(" mmx"); | |
1080 if (mm_flags & MM_MMXEXT) | |
1081 printf(" mmxext"); | |
1082 if (mm_flags & MM_3DNOW) | |
1083 printf(" 3dnow"); | |
1084 if (mm_flags & MM_SSE) | |
1085 printf(" sse"); | |
1086 if (mm_flags & MM_SSE2) | |
1087 printf(" sse2"); | |
1088 printf("\n"); | |
1089 #endif | |
1090 | |
1091 if (mm_flags & MM_MMX) { | |
1092 get_pixels = get_pixels_mmx; | |
324 | 1093 diff_pixels = diff_pixels_mmx; |
0 | 1094 put_pixels_clamped = put_pixels_clamped_mmx; |
1095 add_pixels_clamped = add_pixels_clamped_mmx; | |
296 | 1096 clear_blocks= clear_blocks_mmx; |
1097 | |
294 | 1098 pix_abs16x16 = pix_abs16x16_mmx; |
1099 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
1100 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
0 | 1101 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; |
294 | 1102 pix_abs8x8 = pix_abs8x8_mmx; |
1103 pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
1104 pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
1105 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
0 | 1106 av_fdct = fdct_mmx; |
1107 | |
1108 put_pixels_tab[0] = put_pixels_mmx; | |
1109 put_pixels_tab[1] = put_pixels_x2_mmx; | |
1110 put_pixels_tab[2] = put_pixels_y2_mmx; | |
1111 put_pixels_tab[3] = put_pixels_xy2_mmx; | |
1112 | |
1113 put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
1114 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1115 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1116 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
1117 | |
1118 avg_pixels_tab[0] = avg_pixels_mmx; | |
1119 avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
1120 avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
1121 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1122 | |
1123 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
1124 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
1125 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
1126 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
1127 | |
1128 sub_pixels_tab[0] = sub_pixels_mmx; | |
1129 sub_pixels_tab[1] = sub_pixels_x2_mmx; | |
1130 sub_pixels_tab[2] = sub_pixels_y2_mmx; | |
1131 sub_pixels_tab[3] = sub_pixels_xy2_mmx; | |
1132 | |
1133 if (mm_flags & MM_MMXEXT) { | |
294 | 1134 pix_abs16x16 = pix_abs16x16_mmx2; |
1135 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | |
1136 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | |
1137 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; | |
1138 | |
1139 pix_abs8x8 = pix_abs8x8_mmx2; | |
1140 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | |
1141 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | |
1142 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | |
1143 | |
0 | 1144 put_pixels_tab[1] = put_pixels_x2_sse; |
1145 put_pixels_tab[2] = put_pixels_y2_sse; | |
1146 | |
1147 avg_pixels_tab[0] = avg_pixels_sse; | |
1148 avg_pixels_tab[1] = avg_pixels_x2_sse; | |
1149 avg_pixels_tab[2] = avg_pixels_y2_sse; | |
1150 avg_pixels_tab[3] = avg_pixels_xy2_sse; | |
1151 | |
1152 sub_pixels_tab[1] = sub_pixels_x2_sse; | |
1153 sub_pixels_tab[2] = sub_pixels_y2_sse; | |
1154 } else if (mm_flags & MM_3DNOW) { | |
1155 put_pixels_tab[1] = put_pixels_x2_3dnow; | |
1156 put_pixels_tab[2] = put_pixels_y2_3dnow; | |
1157 | |
1158 avg_pixels_tab[0] = avg_pixels_3dnow; | |
1159 avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
1160 avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
1161 avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
1162 | |
1163 sub_pixels_tab[1] = sub_pixels_x2_3dnow; | |
1164 sub_pixels_tab[2] = sub_pixels_y2_3dnow; | |
1165 } | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1166 |
42 | 1167 /* idct */ |
1168 if (mm_flags & MM_MMXEXT) { | |
1169 ff_idct = ff_mmxext_idct; | |
1170 } else { | |
1171 ff_idct = ff_mmx_idct; | |
1172 } | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1173 #ifdef SIMPLE_IDCT |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1174 // ff_idct = simple_idct; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1175 ff_idct = simple_idct_mmx; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1176 #endif |
0 | 1177 } |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1178 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1179 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1180 // for speed testing |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1181 get_pixels = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1182 put_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1183 add_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1184 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1185 pix_abs16x16 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1186 pix_abs16x16_x2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1187 pix_abs16x16_y2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1188 pix_abs16x16_xy2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1189 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1190 put_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1191 put_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1192 put_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1193 put_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1194 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1195 put_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1196 put_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1197 put_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1198 put_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1199 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1200 avg_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1201 avg_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1202 avg_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1203 avg_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1204 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1205 avg_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1206 avg_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1207 avg_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1208 avg_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1209 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1210 sub_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1211 sub_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1212 sub_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1213 sub_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1214 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1215 //av_fdct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1216 //ff_idct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1217 #endif |
0 | 1218 } |