Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 438:fe58fe638f9b libavcodec
* removed TESTCPU_MAIN - not needed for testing
author | kabi |
---|---|
date | Mon, 27 May 2002 11:32:01 +0000 |
parents | 718a22dc121f |
children | a5edef76dac6 |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
0 | 4 * |
429 | 5 * This library is free software; you can redistribute it and/or |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
0 | 9 * |
429 | 10 * This library is distributed in the hope that it will be useful, |
0 | 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 * Lesser General Public License for more details. | |
0 | 14 * |
429 | 15 * You should have received a copy of the GNU Lesser General Public |
16 * License along with this library; if not, write to the Free Software | |
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
0 | 18 * |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 #include "../dsputil.h" | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
23 #include "../simple_idct.h" |
0 | 24 |
5 | 25 int mm_flags; /* multimedia extension flags */ |
26 | |
294 | 27 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); |
28 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
29 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
30 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
31 | |
32 int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
33 int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
34 int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
35 int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
36 | |
37 int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
38 int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
39 int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
40 int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
41 | |
42 int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
43 int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
44 int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
45 int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
46 | |
42 | 47 /* external functions, from idct_mmx.c */ |
48 void ff_mmx_idct(DCTELEM *block); | |
49 void ff_mmxext_idct(DCTELEM *block); | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
50 |
0 | 51 /* pixel operations */ |
/*
 * Packed 64-bit constants, 8-byte aligned:
 *   mm_bone - every byte        holds 1
 *   mm_wone - every 16-bit word holds 1
 *   mm_wtwo - every 16-bit word holds 2
 */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

/* Emit a ".balign 8" directive at the point of use. */
#define JUMPALIGN() \
    __asm__ __volatile__ (".balign 8"::)

/* Clear the MMX register regd (xor with itself). */
#define MOVQ_ZERO(regd) \
    __asm__ __volatile__ ("pxor %%" #regd ", %%" #regd ::)

/*
 * MOVQ_WONE / MOVQ_WTWO are complete statements that load the "word one" /
 * "word two" pattern into regd.
 * MOVQ_BONE is NOT a statement: it expands to a string-literal fragment meant
 * to be pasted into a larger asm template, and regd is inserted verbatim.
 */
#ifndef PIC

/* Non-PIC build: fetch the patterns from the static constants above. */
#define MOVQ_WONE(regd) \
    __asm__ __volatile__ ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
#define MOVQ_WTWO(regd) \
    __asm__ __volatile__ ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t"

#else

/*
 * PIC build: for a shared library the patterns are synthesised in the
 * register instead of being read from memory.
 *   pcmpeqd r, r  -> all bits set (-1)
 *   psrlw $15, r  -> 1 in every word
 *   psllw $1, r   -> 2 in every word            (WTWO only)
 *   packuswb r, r -> 1 in every byte            (BONE only)
 */
#define MOVQ_WONE(regd) \
    __asm__ __volatile__ ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd ::)

#define MOVQ_WTWO(regd) \
    __asm__ __volatile__ ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd " \n\t" \
        "psllw $1, %%" #regd ::)

#define MOVQ_BONE(regd) \
    "pcmpeqd " #regd ", " #regd " \n\t" \
    "psrlw $15, " #regd " \n\t"\
    "packuswb " #regd ", " #regd " \n\t"

#endif
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
84 |
387 | 85 |
0 | 86 /***********************************/ |
87 /* 3Dnow specific */ | |
88 | |
89 #define DEF(x) x ## _3dnow | |
90 /* for Athlons PAVGUSB is preferred */ | |
91 #define PAVGB "pavgusb" | |
92 | |
93 #include "dsputil_mmx_avg.h" | |
94 | |
95 #undef DEF | |
96 #undef PAVGB | |
97 | |
98 /***********************************/ | |
99 /* MMX2 specific */ | |
100 | |
386 | 101 #define DEF(x) x ## _mmx2 |
0 | 102 |
103 /* Introduced only in MMX2 set */ | |
104 #define PAVGB "pavgb" | |
105 | |
106 #include "dsputil_mmx_avg.h" | |
107 | |
108 #undef DEF | |
109 #undef PAVGB | |
110 | |
111 /***********************************/ | |
112 /* standard MMX */ | |
113 | |
114 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
115 { | |
386 | 116 asm volatile( |
117 "movl $-128, %%eax \n\t" | |
118 "pxor %%mm7, %%mm7 \n\t" | |
119 ".balign 16 \n\t" | |
120 "1: \n\t" | |
121 "movq (%0), %%mm0 \n\t" | |
122 "movq (%0, %2), %%mm2 \n\t" | |
123 "movq %%mm0, %%mm1 \n\t" | |
124 "movq %%mm2, %%mm3 \n\t" | |
125 "punpcklbw %%mm7, %%mm0 \n\t" | |
126 "punpckhbw %%mm7, %%mm1 \n\t" | |
127 "punpcklbw %%mm7, %%mm2 \n\t" | |
128 "punpckhbw %%mm7, %%mm3 \n\t" | |
129 "movq %%mm0, (%1, %%eax)\n\t" | |
130 "movq %%mm1, 8(%1, %%eax)\n\t" | |
131 "movq %%mm2, 16(%1, %%eax)\n\t" | |
132 "movq %%mm3, 24(%1, %%eax)\n\t" | |
133 "addl %3, %0 \n\t" | |
134 "addl $32, %%eax \n\t" | |
135 "js 1b \n\t" | |
136 : "+r" (pixels) | |
137 : "r" (block+64), "r" (line_size), "r" (line_size*2) | |
138 : "%eax" | |
139 ); | |
0 | 140 } |
141 | |
324 | 142 static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) |
143 { | |
144 asm volatile( | |
386 | 145 "pxor %%mm7, %%mm7 \n\t" |
146 "movl $-128, %%eax \n\t" | |
324 | 147 ".balign 16 \n\t" |
148 "1: \n\t" | |
149 "movq (%0), %%mm0 \n\t" | |
150 "movq (%1), %%mm2 \n\t" | |
151 "movq %%mm0, %%mm1 \n\t" | |
152 "movq %%mm2, %%mm3 \n\t" | |
153 "punpcklbw %%mm7, %%mm0 \n\t" | |
154 "punpckhbw %%mm7, %%mm1 \n\t" | |
155 "punpcklbw %%mm7, %%mm2 \n\t" | |
156 "punpckhbw %%mm7, %%mm3 \n\t" | |
157 "psubw %%mm2, %%mm0 \n\t" | |
158 "psubw %%mm3, %%mm1 \n\t" | |
159 "movq %%mm0, (%2, %%eax)\n\t" | |
160 "movq %%mm1, 8(%2, %%eax)\n\t" | |
161 "addl %3, %0 \n\t" | |
162 "addl %3, %1 \n\t" | |
163 "addl $16, %%eax \n\t" | |
164 "jnz 1b \n\t" | |
165 : "+r" (s1), "+r" (s2) | |
166 : "r" (block+64), "r" (stride) | |
167 : "%eax" | |
168 ); | |
169 } | |
170 | |
0 | 171 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) |
172 { | |
173 const DCTELEM *p; | |
174 UINT8 *pix; | |
175 | |
176 /* read the pixels */ | |
177 p = block; | |
178 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
179 /* unrolled loop */ |
0 | 180 __asm __volatile( |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
181 "movq %3, %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
182 "movq 8%3, %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
183 "movq 16%3, %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
184 "movq 24%3, %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
185 "movq 32%3, %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
186 "movq 40%3, %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
187 "movq 48%3, %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
188 "movq 56%3, %%mm7\n\t" |
0 | 189 "packuswb %%mm1, %%mm0\n\t" |
190 "packuswb %%mm3, %%mm2\n\t" | |
191 "packuswb %%mm5, %%mm4\n\t" | |
192 "packuswb %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
193 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
194 "movq %%mm2, (%0, %1)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
195 "movq %%mm4, (%0, %1, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
196 "movq %%mm6, (%0, %2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
197 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) |
0 | 198 :"memory"); |
199 pix += line_size*4; | |
200 p += 32; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
201 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
202 // if here would be an exact copy of the code above |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
203 // compiler would generate some very strange code |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
204 // thus using "r" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
205 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
206 "movq (%3), %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
207 "movq 8(%3), %%mm1\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
208 "movq 16(%3), %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
209 "movq 24(%3), %%mm3\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
210 "movq 32(%3), %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
211 "movq 40(%3), %%mm5\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
212 "movq 48(%3), %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
213 "movq 56(%3), %%mm7\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
214 "packuswb %%mm1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
215 "packuswb %%mm3, %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
216 "packuswb %%mm5, %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
217 "packuswb %%mm7, %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
218 "movq %%mm0, (%0)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
219 "movq %%mm2, (%0, %1)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
220 "movq %%mm4, (%0, %1, 2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
221 "movq %%mm6, (%0, %2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
222 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
223 :"memory"); |
0 | 224 } |
225 | |
226 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
227 { | |
228 const DCTELEM *p; | |
229 UINT8 *pix; | |
230 int i; | |
231 | |
232 /* read the pixels */ | |
233 p = block; | |
234 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
235 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
236 i = 4; |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
237 do { |
0 | 238 __asm __volatile( |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
239 "movq (%2), %%mm0\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
240 "movq 8(%2), %%mm1\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
241 "movq 16(%2), %%mm2\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
242 "movq 24(%2), %%mm3\n\t" |
0 | 243 "movq %0, %%mm4\n\t" |
244 "movq %1, %%mm6\n\t" | |
245 "movq %%mm4, %%mm5\n\t" | |
246 "punpcklbw %%mm7, %%mm4\n\t" | |
247 "punpckhbw %%mm7, %%mm5\n\t" | |
248 "paddsw %%mm4, %%mm0\n\t" | |
249 "paddsw %%mm5, %%mm1\n\t" | |
250 "movq %%mm6, %%mm5\n\t" | |
251 "punpcklbw %%mm7, %%mm6\n\t" | |
252 "punpckhbw %%mm7, %%mm5\n\t" | |
253 "paddsw %%mm6, %%mm2\n\t" | |
254 "paddsw %%mm5, %%mm3\n\t" | |
255 "packuswb %%mm1, %%mm0\n\t" | |
256 "packuswb %%mm3, %%mm2\n\t" | |
257 "movq %%mm0, %0\n\t" | |
258 "movq %%mm2, %1\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
259 :"+m"(*pix), "+m"(*(pix+line_size)) |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
260 :"r"(p) |
0 | 261 :"memory"); |
262 pix += line_size*2; | |
263 p += 16; | |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
264 } while (--i); |
0 | 265 } |
266 | |
267 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
268 { | |
420 | 269 asm volatile |
270 ( | |
271 "lea (%3, %3), %%eax \n\t" | |
422 | 272 ".balign 8 \n\t" |
420 | 273 "1: \n\t" |
274 "movq (%1), %%mm0 \n\t" | |
275 "movq (%1, %3), %%mm1 \n\t" | |
276 "movq %%mm0, (%2) \n\t" | |
277 "movq %%mm1, (%2, %3) \n\t" | |
278 "addl %%eax, %1 \n\t" | |
279 "addl %%eax, %2 \n\t" | |
280 "movq (%1), %%mm0 \n\t" | |
281 "movq (%1, %3), %%mm1 \n\t" | |
282 "movq %%mm0, (%2) \n\t" | |
283 "movq %%mm1, (%2, %3) \n\t" | |
284 "addl %%eax, %1 \n\t" | |
285 "addl %%eax, %2 \n\t" | |
286 "subl $4, %0 \n\t" | |
287 "jnz 1b \n\t" | |
288 : "+g"(h), "+r" (pixels), "+r" (block) | |
289 : "r"(line_size) | |
290 : "%eax", "memory" | |
291 ); | |
0 | 292 } |
293 | |
294 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
295 { | |
296 UINT8 *p; | |
297 const UINT8 *pix; | |
298 p = block; | |
299 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
300 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
301 MOVQ_WONE(mm4); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
302 JUMPALIGN(); |
0 | 303 do { |
304 __asm __volatile( | |
305 "movq %1, %%mm0\n\t" | |
306 "movq 1%1, %%mm1\n\t" | |
307 "movq %%mm0, %%mm2\n\t" | |
308 "movq %%mm1, %%mm3\n\t" | |
309 "punpcklbw %%mm7, %%mm0\n\t" | |
310 "punpcklbw %%mm7, %%mm1\n\t" | |
311 "punpckhbw %%mm7, %%mm2\n\t" | |
312 "punpckhbw %%mm7, %%mm3\n\t" | |
313 "paddusw %%mm1, %%mm0\n\t" | |
314 "paddusw %%mm3, %%mm2\n\t" | |
315 "paddusw %%mm4, %%mm0\n\t" | |
316 "paddusw %%mm4, %%mm2\n\t" | |
317 "psrlw $1, %%mm0\n\t" | |
318 "psrlw $1, %%mm2\n\t" | |
319 "packuswb %%mm2, %%mm0\n\t" | |
320 "movq %%mm0, %0\n\t" | |
321 :"=m"(*p) | |
322 :"m"(*pix) | |
323 :"memory"); | |
324 pix += line_size; p += line_size; | |
325 } while (--h); | |
326 } | |
327 | |
328 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
329 { | |
330 UINT8 *p; | |
331 const UINT8 *pix; | |
332 p = block; | |
333 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
334 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
335 MOVQ_WONE(mm4); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
336 JUMPALIGN(); |
0 | 337 do { |
338 __asm __volatile( | |
339 "movq %1, %%mm0\n\t" | |
340 "movq %2, %%mm1\n\t" | |
341 "movq %%mm0, %%mm2\n\t" | |
342 "movq %%mm1, %%mm3\n\t" | |
343 "punpcklbw %%mm7, %%mm0\n\t" | |
344 "punpcklbw %%mm7, %%mm1\n\t" | |
345 "punpckhbw %%mm7, %%mm2\n\t" | |
346 "punpckhbw %%mm7, %%mm3\n\t" | |
347 "paddusw %%mm1, %%mm0\n\t" | |
348 "paddusw %%mm3, %%mm2\n\t" | |
349 "paddusw %%mm4, %%mm0\n\t" | |
350 "paddusw %%mm4, %%mm2\n\t" | |
351 "psrlw $1, %%mm0\n\t" | |
352 "psrlw $1, %%mm2\n\t" | |
353 "packuswb %%mm2, %%mm0\n\t" | |
354 "movq %%mm0, %0\n\t" | |
355 :"=m"(*p) | |
356 :"m"(*pix), | |
357 "m"(*(pix+line_size)) | |
358 :"memory"); | |
359 pix += line_size; | |
360 p += line_size; | |
361 } while (--h); | |
362 } | |
363 | |
364 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
365 { | |
366 UINT8 *p; | |
367 const UINT8 *pix; | |
368 p = block; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
369 pix = pixels; // 1s |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
370 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
371 MOVQ_WTWO(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
372 JUMPALIGN(); |
0 | 373 do { |
374 __asm __volatile( | |
375 "movq %1, %%mm0\n\t" | |
376 "movq %2, %%mm1\n\t" | |
377 "movq 1%1, %%mm4\n\t" | |
378 "movq 1%2, %%mm5\n\t" | |
379 "movq %%mm0, %%mm2\n\t" | |
380 "movq %%mm1, %%mm3\n\t" | |
381 "punpcklbw %%mm7, %%mm0\n\t" | |
382 "punpcklbw %%mm7, %%mm1\n\t" | |
383 "punpckhbw %%mm7, %%mm2\n\t" | |
384 "punpckhbw %%mm7, %%mm3\n\t" | |
385 "paddusw %%mm1, %%mm0\n\t" | |
386 "paddusw %%mm3, %%mm2\n\t" | |
387 "movq %%mm4, %%mm1\n\t" | |
388 "movq %%mm5, %%mm3\n\t" | |
389 "punpcklbw %%mm7, %%mm4\n\t" | |
390 "punpcklbw %%mm7, %%mm5\n\t" | |
391 "punpckhbw %%mm7, %%mm1\n\t" | |
392 "punpckhbw %%mm7, %%mm3\n\t" | |
393 "paddusw %%mm5, %%mm4\n\t" | |
394 "paddusw %%mm3, %%mm1\n\t" | |
395 "paddusw %%mm6, %%mm4\n\t" | |
396 "paddusw %%mm6, %%mm1\n\t" | |
397 "paddusw %%mm4, %%mm0\n\t" | |
398 "paddusw %%mm1, %%mm2\n\t" | |
399 "psrlw $2, %%mm0\n\t" | |
400 "psrlw $2, %%mm2\n\t" | |
401 "packuswb %%mm2, %%mm0\n\t" | |
402 "movq %%mm0, %0\n\t" | |
403 :"=m"(*p) | |
404 :"m"(*pix), | |
405 "m"(*(pix+line_size)) | |
406 :"memory"); | |
407 pix += line_size; | |
408 p += line_size; | |
409 } while(--h); | |
410 } | |
411 | |
412 static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
413 { | |
414 UINT8 *p; | |
415 const UINT8 *pix; | |
416 p = block; | |
417 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
418 MOVQ_ZERO(mm7); |
0 | 419 do { |
420 __asm __volatile( | |
421 "movq %1, %%mm0\n\t" | |
422 "movq 1%1, %%mm1\n\t" | |
423 "movq %%mm0, %%mm2\n\t" | |
424 "movq %%mm1, %%mm3\n\t" | |
425 "punpcklbw %%mm7, %%mm0\n\t" | |
426 "punpcklbw %%mm7, %%mm1\n\t" | |
427 "punpckhbw %%mm7, %%mm2\n\t" | |
428 "punpckhbw %%mm7, %%mm3\n\t" | |
429 "paddusw %%mm1, %%mm0\n\t" | |
430 "paddusw %%mm3, %%mm2\n\t" | |
431 "psrlw $1, %%mm0\n\t" | |
432 "psrlw $1, %%mm2\n\t" | |
433 "packuswb %%mm2, %%mm0\n\t" | |
434 "movq %%mm0, %0\n\t" | |
435 :"=m"(*p) | |
436 :"m"(*pix) | |
437 :"memory"); | |
438 pix += line_size; | |
439 p += line_size; | |
440 } while (--h); | |
441 } | |
442 | |
443 static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
444 { | |
445 UINT8 *p; | |
446 const UINT8 *pix; | |
447 p = block; | |
448 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
449 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
450 JUMPALIGN(); |
0 | 451 do { |
452 __asm __volatile( | |
453 "movq %1, %%mm0\n\t" | |
454 "movq %2, %%mm1\n\t" | |
455 "movq %%mm0, %%mm2\n\t" | |
456 "movq %%mm1, %%mm3\n\t" | |
457 "punpcklbw %%mm7, %%mm0\n\t" | |
458 "punpcklbw %%mm7, %%mm1\n\t" | |
459 "punpckhbw %%mm7, %%mm2\n\t" | |
460 "punpckhbw %%mm7, %%mm3\n\t" | |
461 "paddusw %%mm1, %%mm0\n\t" | |
462 "paddusw %%mm3, %%mm2\n\t" | |
463 "psrlw $1, %%mm0\n\t" | |
464 "psrlw $1, %%mm2\n\t" | |
465 "packuswb %%mm2, %%mm0\n\t" | |
466 "movq %%mm0, %0\n\t" | |
467 :"=m"(*p) | |
468 :"m"(*pix), | |
469 "m"(*(pix+line_size)) | |
470 :"memory"); | |
471 pix += line_size; | |
472 p += line_size; | |
473 } while(--h); | |
474 } | |
475 | |
476 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
477 { | |
478 UINT8 *p; | |
479 const UINT8 *pix; | |
480 p = block; | |
481 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
482 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
483 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
484 JUMPALIGN(); |
0 | 485 do { |
486 __asm __volatile( | |
487 "movq %1, %%mm0\n\t" | |
488 "movq %2, %%mm1\n\t" | |
489 "movq 1%1, %%mm4\n\t" | |
490 "movq 1%2, %%mm5\n\t" | |
491 "movq %%mm0, %%mm2\n\t" | |
492 "movq %%mm1, %%mm3\n\t" | |
493 "punpcklbw %%mm7, %%mm0\n\t" | |
494 "punpcklbw %%mm7, %%mm1\n\t" | |
495 "punpckhbw %%mm7, %%mm2\n\t" | |
496 "punpckhbw %%mm7, %%mm3\n\t" | |
497 "paddusw %%mm1, %%mm0\n\t" | |
498 "paddusw %%mm3, %%mm2\n\t" | |
499 "movq %%mm4, %%mm1\n\t" | |
500 "movq %%mm5, %%mm3\n\t" | |
501 "punpcklbw %%mm7, %%mm4\n\t" | |
502 "punpcklbw %%mm7, %%mm5\n\t" | |
503 "punpckhbw %%mm7, %%mm1\n\t" | |
504 "punpckhbw %%mm7, %%mm3\n\t" | |
505 "paddusw %%mm5, %%mm4\n\t" | |
506 "paddusw %%mm3, %%mm1\n\t" | |
507 "paddusw %%mm6, %%mm4\n\t" | |
508 "paddusw %%mm6, %%mm1\n\t" | |
509 "paddusw %%mm4, %%mm0\n\t" | |
510 "paddusw %%mm1, %%mm2\n\t" | |
511 "psrlw $2, %%mm0\n\t" | |
512 "psrlw $2, %%mm2\n\t" | |
513 "packuswb %%mm2, %%mm0\n\t" | |
514 "movq %%mm0, %0\n\t" | |
515 :"=m"(*p) | |
516 :"m"(*pix), | |
517 "m"(*(pix+line_size)) | |
518 :"memory"); | |
519 pix += line_size; | |
520 p += line_size; | |
521 } while(--h); | |
522 } | |
523 | |
524 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
525 { | |
526 UINT8 *p; | |
527 const UINT8 *pix; | |
528 p = block; | |
529 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
530 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
531 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
532 JUMPALIGN(); |
0 | 533 do { |
534 __asm __volatile( | |
535 "movq %0, %%mm0\n\t" | |
536 "movq %1, %%mm1\n\t" | |
537 "movq %%mm0, %%mm2\n\t" | |
538 "movq %%mm1, %%mm3\n\t" | |
539 "punpcklbw %%mm7, %%mm0\n\t" | |
540 "punpcklbw %%mm7, %%mm1\n\t" | |
541 "punpckhbw %%mm7, %%mm2\n\t" | |
542 "punpckhbw %%mm7, %%mm3\n\t" | |
543 "paddusw %%mm1, %%mm0\n\t" | |
544 "paddusw %%mm3, %%mm2\n\t" | |
545 "paddusw %%mm6, %%mm0\n\t" | |
546 "paddusw %%mm6, %%mm2\n\t" | |
547 "psrlw $1, %%mm0\n\t" | |
548 "psrlw $1, %%mm2\n\t" | |
549 "packuswb %%mm2, %%mm0\n\t" | |
550 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
551 :"+m"(*p) |
0 | 552 :"m"(*pix) |
553 :"memory"); | |
554 pix += line_size; | |
555 p += line_size; | |
556 } | |
557 while (--h); | |
558 } | |
559 | |
560 static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
561 { | |
562 UINT8 *p; | |
563 const UINT8 *pix; | |
564 p = block; | |
565 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
566 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
567 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
568 JUMPALIGN(); |
0 | 569 do { |
570 __asm __volatile( | |
571 "movq %1, %%mm1\n\t" | |
572 "movq %0, %%mm0\n\t" | |
573 "movq 1%1, %%mm4\n\t" | |
574 "movq %%mm0, %%mm2\n\t" | |
575 "movq %%mm1, %%mm3\n\t" | |
576 "movq %%mm4, %%mm5\n\t" | |
577 "punpcklbw %%mm7, %%mm1\n\t" | |
578 "punpckhbw %%mm7, %%mm3\n\t" | |
579 "punpcklbw %%mm7, %%mm4\n\t" | |
580 "punpckhbw %%mm7, %%mm5\n\t" | |
581 "punpcklbw %%mm7, %%mm0\n\t" | |
582 "punpckhbw %%mm7, %%mm2\n\t" | |
583 "paddusw %%mm4, %%mm1\n\t" | |
584 "paddusw %%mm5, %%mm3\n\t" | |
585 "paddusw %%mm6, %%mm1\n\t" | |
586 "paddusw %%mm6, %%mm3\n\t" | |
587 "psrlw $1, %%mm1\n\t" | |
588 "psrlw $1, %%mm3\n\t" | |
589 "paddusw %%mm6, %%mm0\n\t" | |
590 "paddusw %%mm6, %%mm2\n\t" | |
591 "paddusw %%mm1, %%mm0\n\t" | |
592 "paddusw %%mm3, %%mm2\n\t" | |
593 "psrlw $1, %%mm0\n\t" | |
594 "psrlw $1, %%mm2\n\t" | |
595 "packuswb %%mm2, %%mm0\n\t" | |
596 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
597 :"+m"(*p) |
0 | 598 :"m"(*pix) |
599 :"memory"); | |
600 pix += line_size; | |
601 p += line_size; | |
602 } while (--h); | |
603 } | |
604 | |
605 static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
606 { | |
607 UINT8 *p; | |
608 const UINT8 *pix; | |
609 p = block; | |
610 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
611 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
612 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
613 JUMPALIGN(); |
0 | 614 do { |
615 __asm __volatile( | |
616 "movq %1, %%mm1\n\t" | |
617 "movq %0, %%mm0\n\t" | |
618 "movq %2, %%mm4\n\t" | |
619 "movq %%mm0, %%mm2\n\t" | |
620 "movq %%mm1, %%mm3\n\t" | |
621 "movq %%mm4, %%mm5\n\t" | |
622 "punpcklbw %%mm7, %%mm1\n\t" | |
623 "punpckhbw %%mm7, %%mm3\n\t" | |
624 "punpcklbw %%mm7, %%mm4\n\t" | |
625 "punpckhbw %%mm7, %%mm5\n\t" | |
626 "punpcklbw %%mm7, %%mm0\n\t" | |
627 "punpckhbw %%mm7, %%mm2\n\t" | |
628 "paddusw %%mm4, %%mm1\n\t" | |
629 "paddusw %%mm5, %%mm3\n\t" | |
630 "paddusw %%mm6, %%mm1\n\t" | |
631 "paddusw %%mm6, %%mm3\n\t" | |
632 "psrlw $1, %%mm1\n\t" | |
633 "psrlw $1, %%mm3\n\t" | |
634 "paddusw %%mm6, %%mm0\n\t" | |
635 "paddusw %%mm6, %%mm2\n\t" | |
636 "paddusw %%mm1, %%mm0\n\t" | |
637 "paddusw %%mm3, %%mm2\n\t" | |
638 "psrlw $1, %%mm0\n\t" | |
639 "psrlw $1, %%mm2\n\t" | |
640 "packuswb %%mm2, %%mm0\n\t" | |
641 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
642 :"+m"(*p) |
0 | 643 :"m"(*pix), "m"(*(pix+line_size)) |
644 :"memory"); | |
645 pix += line_size; | |
646 p += line_size ; | |
647 } while(--h); | |
648 } | |
649 | |
650 static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
651 { | |
652 UINT8 *p; | |
653 const UINT8 *pix; | |
654 p = block; | |
655 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
656 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
657 // this doesn't seem to be used offten - so |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
658 // the inside usage of mm_wone is not optimized |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
659 MOVQ_WTWO(mm6); |
0 | 660 do { |
661 __asm __volatile( | |
662 "movq %1, %%mm0\n\t" | |
663 "movq %2, %%mm1\n\t" | |
664 "movq 1%1, %%mm4\n\t" | |
665 "movq 1%2, %%mm5\n\t" | |
666 "movq %%mm0, %%mm2\n\t" | |
667 "movq %%mm1, %%mm3\n\t" | |
668 "punpcklbw %%mm7, %%mm0\n\t" | |
669 "punpcklbw %%mm7, %%mm1\n\t" | |
670 "punpckhbw %%mm7, %%mm2\n\t" | |
671 "punpckhbw %%mm7, %%mm3\n\t" | |
672 "paddusw %%mm1, %%mm0\n\t" | |
673 "paddusw %%mm3, %%mm2\n\t" | |
674 "movq %%mm4, %%mm1\n\t" | |
675 "movq %%mm5, %%mm3\n\t" | |
676 "punpcklbw %%mm7, %%mm4\n\t" | |
677 "punpcklbw %%mm7, %%mm5\n\t" | |
678 "punpckhbw %%mm7, %%mm1\n\t" | |
679 "punpckhbw %%mm7, %%mm3\n\t" | |
680 "paddusw %%mm5, %%mm4\n\t" | |
681 "paddusw %%mm3, %%mm1\n\t" | |
682 "paddusw %%mm6, %%mm4\n\t" | |
683 "paddusw %%mm6, %%mm1\n\t" | |
684 "paddusw %%mm4, %%mm0\n\t" | |
685 "paddusw %%mm1, %%mm2\n\t" | |
686 "movq %3, %%mm5\n\t" | |
687 "psrlw $2, %%mm0\n\t" | |
688 "movq %0, %%mm1\n\t" | |
689 "psrlw $2, %%mm2\n\t" | |
690 "movq %%mm1, %%mm3\n\t" | |
691 "punpcklbw %%mm7, %%mm1\n\t" | |
692 "punpckhbw %%mm7, %%mm3\n\t" | |
693 "paddusw %%mm1, %%mm0\n\t" | |
694 "paddusw %%mm3, %%mm2\n\t" | |
695 "paddusw %%mm5, %%mm0\n\t" | |
696 "paddusw %%mm5, %%mm2\n\t" | |
697 "psrlw $1, %%mm0\n\t" | |
698 "psrlw $1, %%mm2\n\t" | |
699 "packuswb %%mm2, %%mm0\n\t" | |
700 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
701 :"+m"(*p) |
0 | 702 :"m"(*pix), |
8 | 703 "m"(*(pix+line_size)), "m"(mm_wone) |
0 | 704 :"memory"); |
705 pix += line_size; | |
706 p += line_size ; | |
707 } while(--h); | |
708 } | |
709 | |
710 static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
711 { | |
712 UINT8 *p; | |
713 const UINT8 *pix; | |
714 p = block; | |
715 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
716 MOVQ_ZERO(mm7); |
0 | 717 do { |
718 __asm __volatile( | |
719 "movq %1, %%mm0\n\t" | |
720 "movq %0, %%mm1\n\t" | |
721 "movq %%mm0, %%mm2\n\t" | |
722 "movq %%mm1, %%mm3\n\t" | |
723 "punpcklbw %%mm7, %%mm0\n\t" | |
724 "punpcklbw %%mm7, %%mm1\n\t" | |
725 "punpckhbw %%mm7, %%mm2\n\t" | |
726 "punpckhbw %%mm7, %%mm3\n\t" | |
727 "paddusw %%mm1, %%mm0\n\t" | |
728 "paddusw %%mm3, %%mm2\n\t" | |
729 "psrlw $1, %%mm0\n\t" | |
730 "psrlw $1, %%mm2\n\t" | |
731 "packuswb %%mm2, %%mm0\n\t" | |
732 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
733 :"+m"(*p) |
0 | 734 :"m"(*pix) |
735 :"memory"); | |
736 pix += line_size; | |
737 p += line_size ; | |
738 } while (--h); | |
739 } | |
740 | |
741 static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
742 { | |
743 UINT8 *p; | |
744 const UINT8 *pix; | |
745 p = block; | |
746 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
747 MOVQ_ZERO(mm7); |
0 | 748 do { |
749 __asm __volatile( | |
750 "movq %1, %%mm0\n\t" | |
751 "movq 1%1, %%mm1\n\t" | |
752 "movq %0, %%mm4\n\t" | |
753 "movq %%mm0, %%mm2\n\t" | |
754 "movq %%mm1, %%mm3\n\t" | |
755 "movq %%mm4, %%mm5\n\t" | |
756 "punpcklbw %%mm7, %%mm0\n\t" | |
757 "punpcklbw %%mm7, %%mm1\n\t" | |
758 "punpckhbw %%mm7, %%mm2\n\t" | |
759 "punpckhbw %%mm7, %%mm3\n\t" | |
760 "punpcklbw %%mm7, %%mm4\n\t" | |
761 "punpckhbw %%mm7, %%mm5\n\t" | |
762 "paddusw %%mm1, %%mm0\n\t" | |
763 "paddusw %%mm3, %%mm2\n\t" | |
764 "psrlw $1, %%mm0\n\t" | |
765 "psrlw $1, %%mm2\n\t" | |
766 "paddusw %%mm4, %%mm0\n\t" | |
767 "paddusw %%mm5, %%mm2\n\t" | |
768 "psrlw $1, %%mm0\n\t" | |
769 "psrlw $1, %%mm2\n\t" | |
770 "packuswb %%mm2, %%mm0\n\t" | |
771 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
772 :"+m"(*p) |
0 | 773 :"m"(*pix) |
774 :"memory"); | |
775 pix += line_size; | |
776 p += line_size; | |
777 } while (--h); | |
778 } | |
779 | |
780 static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
781 { | |
782 UINT8 *p; | |
783 const UINT8 *pix; | |
784 p = block; | |
785 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
786 MOVQ_ZERO(mm7); |
0 | 787 do { |
788 __asm __volatile( | |
789 "movq %1, %%mm0\n\t" | |
790 "movq %2, %%mm1\n\t" | |
791 "movq %0, %%mm4\n\t" | |
792 "movq %%mm0, %%mm2\n\t" | |
793 "movq %%mm1, %%mm3\n\t" | |
794 "movq %%mm4, %%mm5\n\t" | |
795 "punpcklbw %%mm7, %%mm0\n\t" | |
796 "punpcklbw %%mm7, %%mm1\n\t" | |
797 "punpckhbw %%mm7, %%mm2\n\t" | |
798 "punpckhbw %%mm7, %%mm3\n\t" | |
799 "punpcklbw %%mm7, %%mm4\n\t" | |
800 "punpckhbw %%mm7, %%mm5\n\t" | |
801 "paddusw %%mm1, %%mm0\n\t" | |
802 "paddusw %%mm3, %%mm2\n\t" | |
803 "psrlw $1, %%mm0\n\t" | |
804 "psrlw $1, %%mm2\n\t" | |
805 "paddusw %%mm4, %%mm0\n\t" | |
806 "paddusw %%mm5, %%mm2\n\t" | |
807 "psrlw $1, %%mm0\n\t" | |
808 "psrlw $1, %%mm2\n\t" | |
809 "packuswb %%mm2, %%mm0\n\t" | |
810 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
811 :"+m"(*p) |
0 | 812 :"m"(*pix), "m"(*(pix+line_size)) |
813 :"memory"); | |
814 pix += line_size; | |
815 p += line_size ; | |
816 } while(--h); | |
817 } | |
818 | |
819 static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
820 { | |
821 UINT8 *p; | |
822 const UINT8 *pix; | |
823 p = block; | |
824 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
825 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
826 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
827 JUMPALIGN(); |
0 | 828 do { |
829 __asm __volatile( | |
830 "movq %1, %%mm0\n\t" | |
831 "movq %2, %%mm1\n\t" | |
832 "movq 1%1, %%mm4\n\t" | |
833 "movq 1%2, %%mm5\n\t" | |
834 "movq %%mm0, %%mm2\n\t" | |
835 "movq %%mm1, %%mm3\n\t" | |
836 "punpcklbw %%mm7, %%mm0\n\t" | |
837 "punpcklbw %%mm7, %%mm1\n\t" | |
838 "punpckhbw %%mm7, %%mm2\n\t" | |
839 "punpckhbw %%mm7, %%mm3\n\t" | |
840 "paddusw %%mm1, %%mm0\n\t" | |
841 "paddusw %%mm3, %%mm2\n\t" | |
842 "movq %%mm4, %%mm1\n\t" | |
843 "movq %%mm5, %%mm3\n\t" | |
844 "punpcklbw %%mm7, %%mm4\n\t" | |
845 "punpcklbw %%mm7, %%mm5\n\t" | |
846 "punpckhbw %%mm7, %%mm1\n\t" | |
847 "punpckhbw %%mm7, %%mm3\n\t" | |
848 "paddusw %%mm5, %%mm4\n\t" | |
849 "paddusw %%mm3, %%mm1\n\t" | |
850 "paddusw %%mm6, %%mm4\n\t" | |
851 "paddusw %%mm6, %%mm1\n\t" | |
852 "paddusw %%mm4, %%mm0\n\t" | |
853 "paddusw %%mm1, %%mm2\n\t" | |
854 "movq %0, %%mm1\n\t" | |
855 "psrlw $2, %%mm0\n\t" | |
856 "movq %%mm1, %%mm3\n\t" | |
857 "psrlw $2, %%mm2\n\t" | |
858 "punpcklbw %%mm7, %%mm1\n\t" | |
859 "punpckhbw %%mm7, %%mm3\n\t" | |
860 "paddusw %%mm1, %%mm0\n\t" | |
861 "paddusw %%mm3, %%mm2\n\t" | |
862 "psrlw $1, %%mm0\n\t" | |
863 "psrlw $1, %%mm2\n\t" | |
864 "packuswb %%mm2, %%mm0\n\t" | |
865 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
866 :"+m"(*p) |
0 | 867 :"m"(*pix), |
868 "m"(*(pix+line_size)) | |
869 :"memory"); | |
870 pix += line_size; | |
871 p += line_size; | |
872 } while(--h); | |
873 } | |
874 | |
296 | 875 static void clear_blocks_mmx(DCTELEM *blocks) |
876 { | |
877 asm volatile( | |
878 "pxor %%mm7, %%mm7 \n\t" | |
879 "movl $-128*6, %%eax \n\t" | |
880 "1: \n\t" | |
881 "movq %%mm7, (%0, %%eax) \n\t" | |
882 "movq %%mm7, 8(%0, %%eax) \n\t" | |
883 "movq %%mm7, 16(%0, %%eax) \n\t" | |
884 "movq %%mm7, 24(%0, %%eax) \n\t" | |
885 "addl $32, %%eax \n\t" | |
886 " js 1b \n\t" | |
887 : : "r" (((int)blocks)+128*6) | |
888 : "%eax" | |
889 ); | |
890 } | |
891 | |
#if 0
/* Do-nothing stand-in; compiled out together with the speed-testing
   section of dsputil_init_mmx() that assigns it to the function pointers. */
static void just_return() { }
#endif
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
895 |
0 | 896 void dsputil_init_mmx(void) |
897 { | |
898 mm_flags = mm_support(); | |
188 | 899 #if 1 |
900 printf("libavcodec: CPU flags:"); | |
0 | 901 if (mm_flags & MM_MMX) |
902 printf(" mmx"); | |
903 if (mm_flags & MM_MMXEXT) | |
904 printf(" mmxext"); | |
905 if (mm_flags & MM_3DNOW) | |
906 printf(" 3dnow"); | |
907 if (mm_flags & MM_SSE) | |
908 printf(" sse"); | |
909 if (mm_flags & MM_SSE2) | |
910 printf(" sse2"); | |
911 printf("\n"); | |
912 #endif | |
913 | |
914 if (mm_flags & MM_MMX) { | |
915 get_pixels = get_pixels_mmx; | |
324 | 916 diff_pixels = diff_pixels_mmx; |
0 | 917 put_pixels_clamped = put_pixels_clamped_mmx; |
918 add_pixels_clamped = add_pixels_clamped_mmx; | |
296 | 919 clear_blocks= clear_blocks_mmx; |
415 | 920 |
294 | 921 pix_abs16x16 = pix_abs16x16_mmx; |
922 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
923 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
0 | 924 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; |
294 | 925 pix_abs8x8 = pix_abs8x8_mmx; |
926 pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
927 pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
928 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
0 | 929 av_fdct = fdct_mmx; |
930 | |
931 put_pixels_tab[0] = put_pixels_mmx; | |
932 put_pixels_tab[1] = put_pixels_x2_mmx; | |
933 put_pixels_tab[2] = put_pixels_y2_mmx; | |
934 put_pixels_tab[3] = put_pixels_xy2_mmx; | |
935 | |
936 put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
937 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
938 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
939 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
415 | 940 |
0 | 941 avg_pixels_tab[0] = avg_pixels_mmx; |
942 avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
943 avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
944 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
945 | |
946 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
947 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
948 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
949 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
386 | 950 |
0 | 951 if (mm_flags & MM_MMXEXT) { |
294 | 952 pix_abs16x16 = pix_abs16x16_mmx2; |
953 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | |
954 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | |
955 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; | |
415 | 956 |
294 | 957 pix_abs8x8 = pix_abs8x8_mmx2; |
958 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | |
959 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | |
960 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | |
386 | 961 |
962 put_pixels_tab[1] = put_pixels_x2_mmx2; | |
963 put_pixels_tab[2] = put_pixels_y2_mmx2; | |
964 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2; | |
965 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2; | |
415 | 966 |
386 | 967 avg_pixels_tab[0] = avg_pixels_mmx2; |
968 avg_pixels_tab[1] = avg_pixels_x2_mmx2; | |
969 avg_pixels_tab[2] = avg_pixels_y2_mmx2; | |
970 avg_pixels_tab[3] = avg_pixels_xy2_mmx2; | |
0 | 971 } else if (mm_flags & MM_3DNOW) { |
972 put_pixels_tab[1] = put_pixels_x2_3dnow; | |
973 put_pixels_tab[2] = put_pixels_y2_3dnow; | |
386 | 974 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow; |
975 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow; | |
393 | 976 |
0 | 977 avg_pixels_tab[0] = avg_pixels_3dnow; |
978 avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
979 avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
980 avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
981 } | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
982 |
42 | 983 /* idct */ |
984 if (mm_flags & MM_MMXEXT) { | |
985 ff_idct = ff_mmxext_idct; | |
986 } else { | |
987 ff_idct = ff_mmx_idct; | |
988 } | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
989 #ifdef SIMPLE_IDCT |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
990 // ff_idct = simple_idct; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
991 ff_idct = simple_idct_mmx; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
992 #endif |
0 | 993 } |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
994 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
995 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
996 // for speed testing |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
997 get_pixels = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
998 put_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
999 add_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1000 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1001 pix_abs16x16 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1002 pix_abs16x16_x2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1003 pix_abs16x16_y2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1004 pix_abs16x16_xy2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1005 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1006 put_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1007 put_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1008 put_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1009 put_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1010 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1011 put_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1012 put_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1013 put_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1014 put_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1015 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1016 avg_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1017 avg_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1018 avg_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1019 avg_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1020 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1021 avg_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1022 avg_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1023 avg_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1024 avg_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1025 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1026 //av_fdct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1027 //ff_idct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1028 #endif |
0 | 1029 } |
402 | 1030 |
1031 /* remove any non bit exact operation (testing purpose). NOTE that | |
1032 this function should be kept as small as possible because it is | |
1033 always difficult to test automatically non bit exact cases. */ | |
1034 void dsputil_set_bit_exact_mmx(void) | |
1035 { | |
1036 if (mm_flags & MM_MMX) { | |
1037 if (mm_flags & MM_MMXEXT) { | |
1038 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1039 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1040 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1041 } else if (mm_flags & MM_3DNOW) { | |
1042 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1043 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1044 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1045 } | |
1046 } | |
1047 } |