Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 417:040d86058c4a libavcodec
* align the source initialy address
author | kabi |
---|---|
date | Thu, 23 May 2002 12:17:08 +0000 |
parents | 1c3f42442fba |
children | b2fb2081dab5 |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
3 * Copyright (c) 2000, 2001 Gerard Lantau. | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 * | |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 #include "../dsputil.h" | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
23 #include "../simple_idct.h" |
386 | 24 #include "../mangle.h" |
0 | 25 |
5 | 26 int mm_flags; /* multimedia extension flags */ |
27 | |
294 | 28 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); |
29 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
30 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
31 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
32 | |
33 int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
34 int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
35 int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
36 int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
37 | |
38 int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
39 int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
40 int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
41 int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
42 | |
43 int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
44 int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
45 int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
46 int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
47 | |
42 | 48 /* external functions, from idct_mmx.c */ |
49 void ff_mmx_idct(DCTELEM *block); | |
50 void ff_mmxext_idct(DCTELEM *block); | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
51 |
0 | 52 /* pixel operations */ |
/* 64-bit constants for the MMX code, 8-byte aligned so movq can load them directly. */
static const uint64_t mm_bone __attribute__((aligned(8))) = 0x0101010101010101ULL; /* 1 in each of 8 bytes */
static const uint64_t mm_wone __attribute__((aligned(8))) = 0x0001000100010001ULL; /* 1 in each of 4 words */
static const uint64_t mm_wtwo __attribute__((aligned(8))) = 0x0002000200020002ULL; /* 2 in each of 4 words */
8 | 56 //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; |
57 //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; | |
0 | 58 |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
/* Emit a ".balign 8" assembler directive at the point of use. */
#define JUMPALIGN() __asm__ __volatile__ (".balign 8"::)
/* Clear the MMX register regd (xor with itself). */
#define MOVQ_ZERO(regd) __asm__ __volatile__ ("pxor %%" #regd ", %%" #regd ::)

#ifdef PIC
/*
 * Position-independent build: the constants are synthesised in the register
 * instead of being loaded from memory.
 * pcmpeqd reg,reg sets every bit (-1 in each lane).
 */

/* regd = 1 in each 16-bit word: all-ones shifted right by 15. */
#define MOVQ_WONE(regd) \
    __asm__ __volatile__ ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd ::)

/* regd = 2 in each 16-bit word: the word-one pattern shifted left by 1. */
#define MOVQ_WTWO(regd) \
    __asm__ __volatile__ ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd " \n\t" \
        "psllw $1, %%" #regd ::)

/*
 * Instruction text (not a statement) that leaves 1 in each byte of regd;
 * intended to be pasted into a larger asm string.
 */
#define MOVQ_BONE(regd) \
    "pcmpeqd " #regd ", " #regd " \n\t" \
    "psrlw $15, " #regd " \n\t"\
    "packuswb " #regd ", " #regd " \n\t"
#else
/* Non-PIC build: load the constants straight from the static tables. */
#define MOVQ_WONE(regd) __asm__ __volatile__ ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
#define MOVQ_WTWO(regd) __asm__ __volatile__ ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
/* Instruction text (not a statement) loading mm_bone by its mangled symbol name. */
#define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t"
#endif
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
85 |
387 | 86 |
0 | 87 /***********************************/ |
88 /* 3Dnow specific */ | |
89 | |
90 #define DEF(x) x ## _3dnow | |
91 /* for Athlons PAVGUSB is preferred */ | |
92 #define PAVGB "pavgusb" | |
93 | |
94 #include "dsputil_mmx_avg.h" | |
95 | |
96 #undef DEF | |
97 #undef PAVGB | |
98 | |
99 /***********************************/ | |
100 /* MMX2 specific */ | |
101 | |
386 | 102 #define DEF(x) x ## _mmx2 |
0 | 103 |
104 /* Introduced only in MMX2 set */ | |
105 #define PAVGB "pavgb" | |
106 | |
107 #include "dsputil_mmx_avg.h" | |
108 | |
109 #undef DEF | |
110 #undef PAVGB | |
111 | |
112 /***********************************/ | |
113 /* standard MMX */ | |
114 | |
115 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
116 { | |
386 | 117 asm volatile( |
118 "movl $-128, %%eax \n\t" | |
119 "pxor %%mm7, %%mm7 \n\t" | |
120 ".balign 16 \n\t" | |
121 "1: \n\t" | |
122 "movq (%0), %%mm0 \n\t" | |
123 "movq (%0, %2), %%mm2 \n\t" | |
124 "movq %%mm0, %%mm1 \n\t" | |
125 "movq %%mm2, %%mm3 \n\t" | |
126 "punpcklbw %%mm7, %%mm0 \n\t" | |
127 "punpckhbw %%mm7, %%mm1 \n\t" | |
128 "punpcklbw %%mm7, %%mm2 \n\t" | |
129 "punpckhbw %%mm7, %%mm3 \n\t" | |
130 "movq %%mm0, (%1, %%eax)\n\t" | |
131 "movq %%mm1, 8(%1, %%eax)\n\t" | |
132 "movq %%mm2, 16(%1, %%eax)\n\t" | |
133 "movq %%mm3, 24(%1, %%eax)\n\t" | |
134 "addl %3, %0 \n\t" | |
135 "addl $32, %%eax \n\t" | |
136 "js 1b \n\t" | |
137 : "+r" (pixels) | |
138 : "r" (block+64), "r" (line_size), "r" (line_size*2) | |
139 : "%eax" | |
140 ); | |
0 | 141 } |
142 | |
324 | 143 static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) |
144 { | |
145 asm volatile( | |
386 | 146 "pxor %%mm7, %%mm7 \n\t" |
147 "movl $-128, %%eax \n\t" | |
324 | 148 ".balign 16 \n\t" |
149 "1: \n\t" | |
150 "movq (%0), %%mm0 \n\t" | |
151 "movq (%1), %%mm2 \n\t" | |
152 "movq %%mm0, %%mm1 \n\t" | |
153 "movq %%mm2, %%mm3 \n\t" | |
154 "punpcklbw %%mm7, %%mm0 \n\t" | |
155 "punpckhbw %%mm7, %%mm1 \n\t" | |
156 "punpcklbw %%mm7, %%mm2 \n\t" | |
157 "punpckhbw %%mm7, %%mm3 \n\t" | |
158 "psubw %%mm2, %%mm0 \n\t" | |
159 "psubw %%mm3, %%mm1 \n\t" | |
160 "movq %%mm0, (%2, %%eax)\n\t" | |
161 "movq %%mm1, 8(%2, %%eax)\n\t" | |
162 "addl %3, %0 \n\t" | |
163 "addl %3, %1 \n\t" | |
164 "addl $16, %%eax \n\t" | |
165 "jnz 1b \n\t" | |
166 : "+r" (s1), "+r" (s2) | |
167 : "r" (block+64), "r" (stride) | |
168 : "%eax" | |
169 ); | |
170 } | |
171 | |
0 | 172 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) |
173 { | |
174 const DCTELEM *p; | |
175 UINT8 *pix; | |
176 | |
177 /* read the pixels */ | |
178 p = block; | |
179 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
180 /* unrolled loop */ |
0 | 181 __asm __volatile( |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
182 "movq %3, %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
183 "movq 8%3, %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
184 "movq 16%3, %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
185 "movq 24%3, %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
186 "movq 32%3, %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
187 "movq 40%3, %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
188 "movq 48%3, %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
189 "movq 56%3, %%mm7\n\t" |
0 | 190 "packuswb %%mm1, %%mm0\n\t" |
191 "packuswb %%mm3, %%mm2\n\t" | |
192 "packuswb %%mm5, %%mm4\n\t" | |
193 "packuswb %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
194 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
195 "movq %%mm2, (%0, %1)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
196 "movq %%mm4, (%0, %1, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
197 "movq %%mm6, (%0, %2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
198 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) |
0 | 199 :"memory"); |
200 pix += line_size*4; | |
201 p += 32; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
202 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
203 // if here would be an exact copy of the code above |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
204 // compiler would generate some very strange code |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
205 // thus using "r" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
206 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
207 "movq (%3), %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
208 "movq 8(%3), %%mm1\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
209 "movq 16(%3), %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
210 "movq 24(%3), %%mm3\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
211 "movq 32(%3), %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
212 "movq 40(%3), %%mm5\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
213 "movq 48(%3), %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
214 "movq 56(%3), %%mm7\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
215 "packuswb %%mm1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
216 "packuswb %%mm3, %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
217 "packuswb %%mm5, %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
218 "packuswb %%mm7, %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
219 "movq %%mm0, (%0)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
220 "movq %%mm2, (%0, %1)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
221 "movq %%mm4, (%0, %1, 2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
222 "movq %%mm6, (%0, %2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
223 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
224 :"memory"); |
0 | 225 } |
226 | |
227 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
228 { | |
229 const DCTELEM *p; | |
230 UINT8 *pix; | |
231 int i; | |
232 | |
233 /* read the pixels */ | |
234 p = block; | |
235 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
236 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
237 i = 4; |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
238 do { |
0 | 239 __asm __volatile( |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
240 "movq (%2), %%mm0\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
241 "movq 8(%2), %%mm1\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
242 "movq 16(%2), %%mm2\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
243 "movq 24(%2), %%mm3\n\t" |
0 | 244 "movq %0, %%mm4\n\t" |
245 "movq %1, %%mm6\n\t" | |
246 "movq %%mm4, %%mm5\n\t" | |
247 "punpcklbw %%mm7, %%mm4\n\t" | |
248 "punpckhbw %%mm7, %%mm5\n\t" | |
249 "paddsw %%mm4, %%mm0\n\t" | |
250 "paddsw %%mm5, %%mm1\n\t" | |
251 "movq %%mm6, %%mm5\n\t" | |
252 "punpcklbw %%mm7, %%mm6\n\t" | |
253 "punpckhbw %%mm7, %%mm5\n\t" | |
254 "paddsw %%mm6, %%mm2\n\t" | |
255 "paddsw %%mm5, %%mm3\n\t" | |
256 "packuswb %%mm1, %%mm0\n\t" | |
257 "packuswb %%mm3, %%mm2\n\t" | |
258 "movq %%mm0, %0\n\t" | |
259 "movq %%mm2, %1\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
260 :"+m"(*pix), "+m"(*(pix+line_size)) |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
261 :"r"(p) |
0 | 262 :"memory"); |
263 pix += line_size*2; | |
264 p += 16; | |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
265 } while (--i); |
0 | 266 } |
267 | |
268 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
269 { | |
386 | 270 #if 0 //FIXME h==4 case |
271 asm volatile( | |
272 "xorl %%eax, %%eax \n\t" | |
273 "movl %3, %%esi \n\t" | |
274 "1: \n\t" | |
275 "movq (%1, %%eax), %%mm0 \n\t" | |
276 "movq %%mm0, (%0, %%eax) \n\t" | |
277 "addl %2, %%eax \n\t" | |
278 "movq (%1, %%eax), %%mm0 \n\t" | |
279 "movq %%mm0, (%0, %%eax) \n\t" | |
280 "addl %2, %%eax \n\t" | |
281 "movq (%1, %%eax), %%mm0 \n\t" | |
282 "movq %%mm0, (%0, %%eax) \n\t" | |
283 "addl %2, %%eax \n\t" | |
284 "movq (%1, %%eax), %%mm0 \n\t" | |
285 "movq %%mm0, (%0, %%eax) \n\t" | |
286 "addl %2, %%eax \n\t" | |
287 "movq (%1, %%eax), %%mm0 \n\t" | |
288 "movq %%mm0, (%0, %%eax) \n\t" | |
289 "addl %2, %%eax \n\t" | |
290 "movq (%1, %%eax), %%mm0 \n\t" | |
291 "movq %%mm0, (%0, %%eax) \n\t" | |
292 "addl %2, %%eax \n\t" | |
293 "movq (%1, %%eax), %%mm0 \n\t" | |
294 "movq %%mm0, (%0, %%eax) \n\t" | |
295 "addl %2, %%eax \n\t" | |
296 "movq (%1, %%eax), %%mm0 \n\t" | |
297 "movq %%mm0, (%0, %%eax) \n\t" | |
298 "addl %2, %%eax \n\t" | |
299 "subl $8, %%esi \n\t" | |
300 " jnz 1b \n\t" | |
301 :: "r" (block), "r" (pixels), "r"(line_size), "m"(h) | |
302 : "%eax", "%esi", "memory" | |
303 ); | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
304 #else |
386 | 305 asm volatile( |
306 "xorl %%eax, %%eax \n\t" | |
307 "movl %3, %%esi \n\t" | |
308 "1: \n\t" | |
309 "movq (%1, %%eax), %%mm0 \n\t" | |
310 "movq %%mm0, (%0, %%eax) \n\t" | |
311 "addl %2, %%eax \n\t" | |
312 "movq (%1, %%eax), %%mm0 \n\t" | |
313 "movq %%mm0, (%0, %%eax) \n\t" | |
314 "addl %2, %%eax \n\t" | |
315 "movq (%1, %%eax), %%mm0 \n\t" | |
316 "movq %%mm0, (%0, %%eax) \n\t" | |
317 "addl %2, %%eax \n\t" | |
318 "movq (%1, %%eax), %%mm0 \n\t" | |
319 "movq %%mm0, (%0, %%eax) \n\t" | |
320 "addl %2, %%eax \n\t" | |
321 "subl $4, %%esi \n\t" | |
322 " jnz 1b \n\t" | |
323 :: "r" (block), "r" (pixels), "r"(line_size), "m"(h) | |
324 : "%eax", "%esi", "memory" | |
325 ); | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
326 #endif |
0 | 327 } |
328 | |
329 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
330 { | |
331 UINT8 *p; | |
332 const UINT8 *pix; | |
333 p = block; | |
334 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
335 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
336 MOVQ_WONE(mm4); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
337 JUMPALIGN(); |
0 | 338 do { |
339 __asm __volatile( | |
340 "movq %1, %%mm0\n\t" | |
341 "movq 1%1, %%mm1\n\t" | |
342 "movq %%mm0, %%mm2\n\t" | |
343 "movq %%mm1, %%mm3\n\t" | |
344 "punpcklbw %%mm7, %%mm0\n\t" | |
345 "punpcklbw %%mm7, %%mm1\n\t" | |
346 "punpckhbw %%mm7, %%mm2\n\t" | |
347 "punpckhbw %%mm7, %%mm3\n\t" | |
348 "paddusw %%mm1, %%mm0\n\t" | |
349 "paddusw %%mm3, %%mm2\n\t" | |
350 "paddusw %%mm4, %%mm0\n\t" | |
351 "paddusw %%mm4, %%mm2\n\t" | |
352 "psrlw $1, %%mm0\n\t" | |
353 "psrlw $1, %%mm2\n\t" | |
354 "packuswb %%mm2, %%mm0\n\t" | |
355 "movq %%mm0, %0\n\t" | |
356 :"=m"(*p) | |
357 :"m"(*pix) | |
358 :"memory"); | |
359 pix += line_size; p += line_size; | |
360 } while (--h); | |
361 } | |
362 | |
363 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
364 { | |
365 UINT8 *p; | |
366 const UINT8 *pix; | |
367 p = block; | |
368 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
369 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
370 MOVQ_WONE(mm4); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
371 JUMPALIGN(); |
0 | 372 do { |
373 __asm __volatile( | |
374 "movq %1, %%mm0\n\t" | |
375 "movq %2, %%mm1\n\t" | |
376 "movq %%mm0, %%mm2\n\t" | |
377 "movq %%mm1, %%mm3\n\t" | |
378 "punpcklbw %%mm7, %%mm0\n\t" | |
379 "punpcklbw %%mm7, %%mm1\n\t" | |
380 "punpckhbw %%mm7, %%mm2\n\t" | |
381 "punpckhbw %%mm7, %%mm3\n\t" | |
382 "paddusw %%mm1, %%mm0\n\t" | |
383 "paddusw %%mm3, %%mm2\n\t" | |
384 "paddusw %%mm4, %%mm0\n\t" | |
385 "paddusw %%mm4, %%mm2\n\t" | |
386 "psrlw $1, %%mm0\n\t" | |
387 "psrlw $1, %%mm2\n\t" | |
388 "packuswb %%mm2, %%mm0\n\t" | |
389 "movq %%mm0, %0\n\t" | |
390 :"=m"(*p) | |
391 :"m"(*pix), | |
392 "m"(*(pix+line_size)) | |
393 :"memory"); | |
394 pix += line_size; | |
395 p += line_size; | |
396 } while (--h); | |
397 } | |
398 | |
399 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
400 { | |
401 UINT8 *p; | |
402 const UINT8 *pix; | |
403 p = block; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
404 pix = pixels; // 1s |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
405 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
406 MOVQ_WTWO(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
407 JUMPALIGN(); |
0 | 408 do { |
409 __asm __volatile( | |
410 "movq %1, %%mm0\n\t" | |
411 "movq %2, %%mm1\n\t" | |
412 "movq 1%1, %%mm4\n\t" | |
413 "movq 1%2, %%mm5\n\t" | |
414 "movq %%mm0, %%mm2\n\t" | |
415 "movq %%mm1, %%mm3\n\t" | |
416 "punpcklbw %%mm7, %%mm0\n\t" | |
417 "punpcklbw %%mm7, %%mm1\n\t" | |
418 "punpckhbw %%mm7, %%mm2\n\t" | |
419 "punpckhbw %%mm7, %%mm3\n\t" | |
420 "paddusw %%mm1, %%mm0\n\t" | |
421 "paddusw %%mm3, %%mm2\n\t" | |
422 "movq %%mm4, %%mm1\n\t" | |
423 "movq %%mm5, %%mm3\n\t" | |
424 "punpcklbw %%mm7, %%mm4\n\t" | |
425 "punpcklbw %%mm7, %%mm5\n\t" | |
426 "punpckhbw %%mm7, %%mm1\n\t" | |
427 "punpckhbw %%mm7, %%mm3\n\t" | |
428 "paddusw %%mm5, %%mm4\n\t" | |
429 "paddusw %%mm3, %%mm1\n\t" | |
430 "paddusw %%mm6, %%mm4\n\t" | |
431 "paddusw %%mm6, %%mm1\n\t" | |
432 "paddusw %%mm4, %%mm0\n\t" | |
433 "paddusw %%mm1, %%mm2\n\t" | |
434 "psrlw $2, %%mm0\n\t" | |
435 "psrlw $2, %%mm2\n\t" | |
436 "packuswb %%mm2, %%mm0\n\t" | |
437 "movq %%mm0, %0\n\t" | |
438 :"=m"(*p) | |
439 :"m"(*pix), | |
440 "m"(*(pix+line_size)) | |
441 :"memory"); | |
442 pix += line_size; | |
443 p += line_size; | |
444 } while(--h); | |
445 } | |
446 | |
447 static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
448 { | |
449 UINT8 *p; | |
450 const UINT8 *pix; | |
451 p = block; | |
452 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
453 MOVQ_ZERO(mm7); |
0 | 454 do { |
455 __asm __volatile( | |
456 "movq %1, %%mm0\n\t" | |
457 "movq 1%1, %%mm1\n\t" | |
458 "movq %%mm0, %%mm2\n\t" | |
459 "movq %%mm1, %%mm3\n\t" | |
460 "punpcklbw %%mm7, %%mm0\n\t" | |
461 "punpcklbw %%mm7, %%mm1\n\t" | |
462 "punpckhbw %%mm7, %%mm2\n\t" | |
463 "punpckhbw %%mm7, %%mm3\n\t" | |
464 "paddusw %%mm1, %%mm0\n\t" | |
465 "paddusw %%mm3, %%mm2\n\t" | |
466 "psrlw $1, %%mm0\n\t" | |
467 "psrlw $1, %%mm2\n\t" | |
468 "packuswb %%mm2, %%mm0\n\t" | |
469 "movq %%mm0, %0\n\t" | |
470 :"=m"(*p) | |
471 :"m"(*pix) | |
472 :"memory"); | |
473 pix += line_size; | |
474 p += line_size; | |
475 } while (--h); | |
476 } | |
477 | |
478 static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
479 { | |
480 UINT8 *p; | |
481 const UINT8 *pix; | |
482 p = block; | |
483 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
484 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
485 JUMPALIGN(); |
0 | 486 do { |
487 __asm __volatile( | |
488 "movq %1, %%mm0\n\t" | |
489 "movq %2, %%mm1\n\t" | |
490 "movq %%mm0, %%mm2\n\t" | |
491 "movq %%mm1, %%mm3\n\t" | |
492 "punpcklbw %%mm7, %%mm0\n\t" | |
493 "punpcklbw %%mm7, %%mm1\n\t" | |
494 "punpckhbw %%mm7, %%mm2\n\t" | |
495 "punpckhbw %%mm7, %%mm3\n\t" | |
496 "paddusw %%mm1, %%mm0\n\t" | |
497 "paddusw %%mm3, %%mm2\n\t" | |
498 "psrlw $1, %%mm0\n\t" | |
499 "psrlw $1, %%mm2\n\t" | |
500 "packuswb %%mm2, %%mm0\n\t" | |
501 "movq %%mm0, %0\n\t" | |
502 :"=m"(*p) | |
503 :"m"(*pix), | |
504 "m"(*(pix+line_size)) | |
505 :"memory"); | |
506 pix += line_size; | |
507 p += line_size; | |
508 } while(--h); | |
509 } | |
510 | |
511 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
512 { | |
513 UINT8 *p; | |
514 const UINT8 *pix; | |
515 p = block; | |
516 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
517 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
518 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
519 JUMPALIGN(); |
0 | 520 do { |
521 __asm __volatile( | |
522 "movq %1, %%mm0\n\t" | |
523 "movq %2, %%mm1\n\t" | |
524 "movq 1%1, %%mm4\n\t" | |
525 "movq 1%2, %%mm5\n\t" | |
526 "movq %%mm0, %%mm2\n\t" | |
527 "movq %%mm1, %%mm3\n\t" | |
528 "punpcklbw %%mm7, %%mm0\n\t" | |
529 "punpcklbw %%mm7, %%mm1\n\t" | |
530 "punpckhbw %%mm7, %%mm2\n\t" | |
531 "punpckhbw %%mm7, %%mm3\n\t" | |
532 "paddusw %%mm1, %%mm0\n\t" | |
533 "paddusw %%mm3, %%mm2\n\t" | |
534 "movq %%mm4, %%mm1\n\t" | |
535 "movq %%mm5, %%mm3\n\t" | |
536 "punpcklbw %%mm7, %%mm4\n\t" | |
537 "punpcklbw %%mm7, %%mm5\n\t" | |
538 "punpckhbw %%mm7, %%mm1\n\t" | |
539 "punpckhbw %%mm7, %%mm3\n\t" | |
540 "paddusw %%mm5, %%mm4\n\t" | |
541 "paddusw %%mm3, %%mm1\n\t" | |
542 "paddusw %%mm6, %%mm4\n\t" | |
543 "paddusw %%mm6, %%mm1\n\t" | |
544 "paddusw %%mm4, %%mm0\n\t" | |
545 "paddusw %%mm1, %%mm2\n\t" | |
546 "psrlw $2, %%mm0\n\t" | |
547 "psrlw $2, %%mm2\n\t" | |
548 "packuswb %%mm2, %%mm0\n\t" | |
549 "movq %%mm0, %0\n\t" | |
550 :"=m"(*p) | |
551 :"m"(*pix), | |
552 "m"(*(pix+line_size)) | |
553 :"memory"); | |
554 pix += line_size; | |
555 p += line_size; | |
556 } while(--h); | |
557 } | |
558 | |
559 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
560 { | |
561 UINT8 *p; | |
562 const UINT8 *pix; | |
563 p = block; | |
564 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
565 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
566 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
567 JUMPALIGN(); |
0 | 568 do { |
569 __asm __volatile( | |
570 "movq %0, %%mm0\n\t" | |
571 "movq %1, %%mm1\n\t" | |
572 "movq %%mm0, %%mm2\n\t" | |
573 "movq %%mm1, %%mm3\n\t" | |
574 "punpcklbw %%mm7, %%mm0\n\t" | |
575 "punpcklbw %%mm7, %%mm1\n\t" | |
576 "punpckhbw %%mm7, %%mm2\n\t" | |
577 "punpckhbw %%mm7, %%mm3\n\t" | |
578 "paddusw %%mm1, %%mm0\n\t" | |
579 "paddusw %%mm3, %%mm2\n\t" | |
580 "paddusw %%mm6, %%mm0\n\t" | |
581 "paddusw %%mm6, %%mm2\n\t" | |
582 "psrlw $1, %%mm0\n\t" | |
583 "psrlw $1, %%mm2\n\t" | |
584 "packuswb %%mm2, %%mm0\n\t" | |
585 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
586 :"+m"(*p) |
0 | 587 :"m"(*pix) |
588 :"memory"); | |
589 pix += line_size; | |
590 p += line_size; | |
591 } | |
592 while (--h); | |
593 } | |
594 | |
595 static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
596 { | |
597 UINT8 *p; | |
598 const UINT8 *pix; | |
599 p = block; | |
600 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
601 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
602 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
603 JUMPALIGN(); |
0 | 604 do { |
605 __asm __volatile( | |
606 "movq %1, %%mm1\n\t" | |
607 "movq %0, %%mm0\n\t" | |
608 "movq 1%1, %%mm4\n\t" | |
609 "movq %%mm0, %%mm2\n\t" | |
610 "movq %%mm1, %%mm3\n\t" | |
611 "movq %%mm4, %%mm5\n\t" | |
612 "punpcklbw %%mm7, %%mm1\n\t" | |
613 "punpckhbw %%mm7, %%mm3\n\t" | |
614 "punpcklbw %%mm7, %%mm4\n\t" | |
615 "punpckhbw %%mm7, %%mm5\n\t" | |
616 "punpcklbw %%mm7, %%mm0\n\t" | |
617 "punpckhbw %%mm7, %%mm2\n\t" | |
618 "paddusw %%mm4, %%mm1\n\t" | |
619 "paddusw %%mm5, %%mm3\n\t" | |
620 "paddusw %%mm6, %%mm1\n\t" | |
621 "paddusw %%mm6, %%mm3\n\t" | |
622 "psrlw $1, %%mm1\n\t" | |
623 "psrlw $1, %%mm3\n\t" | |
624 "paddusw %%mm6, %%mm0\n\t" | |
625 "paddusw %%mm6, %%mm2\n\t" | |
626 "paddusw %%mm1, %%mm0\n\t" | |
627 "paddusw %%mm3, %%mm2\n\t" | |
628 "psrlw $1, %%mm0\n\t" | |
629 "psrlw $1, %%mm2\n\t" | |
630 "packuswb %%mm2, %%mm0\n\t" | |
631 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
632 :"+m"(*p) |
0 | 633 :"m"(*pix) |
634 :"memory"); | |
635 pix += line_size; | |
636 p += line_size; | |
637 } while (--h); | |
638 } | |
639 | |
640 static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
641 { | |
642 UINT8 *p; | |
643 const UINT8 *pix; | |
644 p = block; | |
645 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
646 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
647 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
648 JUMPALIGN(); |
0 | 649 do { |
650 __asm __volatile( | |
651 "movq %1, %%mm1\n\t" | |
652 "movq %0, %%mm0\n\t" | |
653 "movq %2, %%mm4\n\t" | |
654 "movq %%mm0, %%mm2\n\t" | |
655 "movq %%mm1, %%mm3\n\t" | |
656 "movq %%mm4, %%mm5\n\t" | |
657 "punpcklbw %%mm7, %%mm1\n\t" | |
658 "punpckhbw %%mm7, %%mm3\n\t" | |
659 "punpcklbw %%mm7, %%mm4\n\t" | |
660 "punpckhbw %%mm7, %%mm5\n\t" | |
661 "punpcklbw %%mm7, %%mm0\n\t" | |
662 "punpckhbw %%mm7, %%mm2\n\t" | |
663 "paddusw %%mm4, %%mm1\n\t" | |
664 "paddusw %%mm5, %%mm3\n\t" | |
665 "paddusw %%mm6, %%mm1\n\t" | |
666 "paddusw %%mm6, %%mm3\n\t" | |
667 "psrlw $1, %%mm1\n\t" | |
668 "psrlw $1, %%mm3\n\t" | |
669 "paddusw %%mm6, %%mm0\n\t" | |
670 "paddusw %%mm6, %%mm2\n\t" | |
671 "paddusw %%mm1, %%mm0\n\t" | |
672 "paddusw %%mm3, %%mm2\n\t" | |
673 "psrlw $1, %%mm0\n\t" | |
674 "psrlw $1, %%mm2\n\t" | |
675 "packuswb %%mm2, %%mm0\n\t" | |
676 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
677 :"+m"(*p) |
0 | 678 :"m"(*pix), "m"(*(pix+line_size)) |
679 :"memory"); | |
680 pix += line_size; | |
681 p += line_size ; | |
682 } while(--h); | |
683 } | |
684 | |
685 static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
686 { | |
687 UINT8 *p; | |
688 const UINT8 *pix; | |
689 p = block; | |
690 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
691 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
692 // this doesn't seem to be used offten - so |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
693 // the inside usage of mm_wone is not optimized |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
694 MOVQ_WTWO(mm6); |
0 | 695 do { |
696 __asm __volatile( | |
697 "movq %1, %%mm0\n\t" | |
698 "movq %2, %%mm1\n\t" | |
699 "movq 1%1, %%mm4\n\t" | |
700 "movq 1%2, %%mm5\n\t" | |
701 "movq %%mm0, %%mm2\n\t" | |
702 "movq %%mm1, %%mm3\n\t" | |
703 "punpcklbw %%mm7, %%mm0\n\t" | |
704 "punpcklbw %%mm7, %%mm1\n\t" | |
705 "punpckhbw %%mm7, %%mm2\n\t" | |
706 "punpckhbw %%mm7, %%mm3\n\t" | |
707 "paddusw %%mm1, %%mm0\n\t" | |
708 "paddusw %%mm3, %%mm2\n\t" | |
709 "movq %%mm4, %%mm1\n\t" | |
710 "movq %%mm5, %%mm3\n\t" | |
711 "punpcklbw %%mm7, %%mm4\n\t" | |
712 "punpcklbw %%mm7, %%mm5\n\t" | |
713 "punpckhbw %%mm7, %%mm1\n\t" | |
714 "punpckhbw %%mm7, %%mm3\n\t" | |
715 "paddusw %%mm5, %%mm4\n\t" | |
716 "paddusw %%mm3, %%mm1\n\t" | |
717 "paddusw %%mm6, %%mm4\n\t" | |
718 "paddusw %%mm6, %%mm1\n\t" | |
719 "paddusw %%mm4, %%mm0\n\t" | |
720 "paddusw %%mm1, %%mm2\n\t" | |
721 "movq %3, %%mm5\n\t" | |
722 "psrlw $2, %%mm0\n\t" | |
723 "movq %0, %%mm1\n\t" | |
724 "psrlw $2, %%mm2\n\t" | |
725 "movq %%mm1, %%mm3\n\t" | |
726 "punpcklbw %%mm7, %%mm1\n\t" | |
727 "punpckhbw %%mm7, %%mm3\n\t" | |
728 "paddusw %%mm1, %%mm0\n\t" | |
729 "paddusw %%mm3, %%mm2\n\t" | |
730 "paddusw %%mm5, %%mm0\n\t" | |
731 "paddusw %%mm5, %%mm2\n\t" | |
732 "psrlw $1, %%mm0\n\t" | |
733 "psrlw $1, %%mm2\n\t" | |
734 "packuswb %%mm2, %%mm0\n\t" | |
735 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
736 :"+m"(*p) |
0 | 737 :"m"(*pix), |
8 | 738 "m"(*(pix+line_size)), "m"(mm_wone) |
0 | 739 :"memory"); |
740 pix += line_size; | |
741 p += line_size ; | |
742 } while(--h); | |
743 } | |
744 | |
745 static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
746 { | |
747 UINT8 *p; | |
748 const UINT8 *pix; | |
749 p = block; | |
750 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
751 MOVQ_ZERO(mm7); |
0 | 752 do { |
753 __asm __volatile( | |
754 "movq %1, %%mm0\n\t" | |
755 "movq %0, %%mm1\n\t" | |
756 "movq %%mm0, %%mm2\n\t" | |
757 "movq %%mm1, %%mm3\n\t" | |
758 "punpcklbw %%mm7, %%mm0\n\t" | |
759 "punpcklbw %%mm7, %%mm1\n\t" | |
760 "punpckhbw %%mm7, %%mm2\n\t" | |
761 "punpckhbw %%mm7, %%mm3\n\t" | |
762 "paddusw %%mm1, %%mm0\n\t" | |
763 "paddusw %%mm3, %%mm2\n\t" | |
764 "psrlw $1, %%mm0\n\t" | |
765 "psrlw $1, %%mm2\n\t" | |
766 "packuswb %%mm2, %%mm0\n\t" | |
767 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
768 :"+m"(*p) |
0 | 769 :"m"(*pix) |
770 :"memory"); | |
771 pix += line_size; | |
772 p += line_size ; | |
773 } while (--h); | |
774 } | |
775 | |
776 static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
777 { | |
778 UINT8 *p; | |
779 const UINT8 *pix; | |
780 p = block; | |
781 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
782 MOVQ_ZERO(mm7); |
0 | 783 do { |
784 __asm __volatile( | |
785 "movq %1, %%mm0\n\t" | |
786 "movq 1%1, %%mm1\n\t" | |
787 "movq %0, %%mm4\n\t" | |
788 "movq %%mm0, %%mm2\n\t" | |
789 "movq %%mm1, %%mm3\n\t" | |
790 "movq %%mm4, %%mm5\n\t" | |
791 "punpcklbw %%mm7, %%mm0\n\t" | |
792 "punpcklbw %%mm7, %%mm1\n\t" | |
793 "punpckhbw %%mm7, %%mm2\n\t" | |
794 "punpckhbw %%mm7, %%mm3\n\t" | |
795 "punpcklbw %%mm7, %%mm4\n\t" | |
796 "punpckhbw %%mm7, %%mm5\n\t" | |
797 "paddusw %%mm1, %%mm0\n\t" | |
798 "paddusw %%mm3, %%mm2\n\t" | |
799 "psrlw $1, %%mm0\n\t" | |
800 "psrlw $1, %%mm2\n\t" | |
801 "paddusw %%mm4, %%mm0\n\t" | |
802 "paddusw %%mm5, %%mm2\n\t" | |
803 "psrlw $1, %%mm0\n\t" | |
804 "psrlw $1, %%mm2\n\t" | |
805 "packuswb %%mm2, %%mm0\n\t" | |
806 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
807 :"+m"(*p) |
0 | 808 :"m"(*pix) |
809 :"memory"); | |
810 pix += line_size; | |
811 p += line_size; | |
812 } while (--h); | |
813 } | |
814 | |
815 static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
816 { | |
817 UINT8 *p; | |
818 const UINT8 *pix; | |
819 p = block; | |
820 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
821 MOVQ_ZERO(mm7); |
0 | 822 do { |
823 __asm __volatile( | |
824 "movq %1, %%mm0\n\t" | |
825 "movq %2, %%mm1\n\t" | |
826 "movq %0, %%mm4\n\t" | |
827 "movq %%mm0, %%mm2\n\t" | |
828 "movq %%mm1, %%mm3\n\t" | |
829 "movq %%mm4, %%mm5\n\t" | |
830 "punpcklbw %%mm7, %%mm0\n\t" | |
831 "punpcklbw %%mm7, %%mm1\n\t" | |
832 "punpckhbw %%mm7, %%mm2\n\t" | |
833 "punpckhbw %%mm7, %%mm3\n\t" | |
834 "punpcklbw %%mm7, %%mm4\n\t" | |
835 "punpckhbw %%mm7, %%mm5\n\t" | |
836 "paddusw %%mm1, %%mm0\n\t" | |
837 "paddusw %%mm3, %%mm2\n\t" | |
838 "psrlw $1, %%mm0\n\t" | |
839 "psrlw $1, %%mm2\n\t" | |
840 "paddusw %%mm4, %%mm0\n\t" | |
841 "paddusw %%mm5, %%mm2\n\t" | |
842 "psrlw $1, %%mm0\n\t" | |
843 "psrlw $1, %%mm2\n\t" | |
844 "packuswb %%mm2, %%mm0\n\t" | |
845 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
846 :"+m"(*p) |
0 | 847 :"m"(*pix), "m"(*(pix+line_size)) |
848 :"memory"); | |
849 pix += line_size; | |
850 p += line_size ; | |
851 } while(--h); | |
852 } | |
853 | |
854 static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
855 { | |
856 UINT8 *p; | |
857 const UINT8 *pix; | |
858 p = block; | |
859 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
860 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
861 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
862 JUMPALIGN(); |
0 | 863 do { |
864 __asm __volatile( | |
865 "movq %1, %%mm0\n\t" | |
866 "movq %2, %%mm1\n\t" | |
867 "movq 1%1, %%mm4\n\t" | |
868 "movq 1%2, %%mm5\n\t" | |
869 "movq %%mm0, %%mm2\n\t" | |
870 "movq %%mm1, %%mm3\n\t" | |
871 "punpcklbw %%mm7, %%mm0\n\t" | |
872 "punpcklbw %%mm7, %%mm1\n\t" | |
873 "punpckhbw %%mm7, %%mm2\n\t" | |
874 "punpckhbw %%mm7, %%mm3\n\t" | |
875 "paddusw %%mm1, %%mm0\n\t" | |
876 "paddusw %%mm3, %%mm2\n\t" | |
877 "movq %%mm4, %%mm1\n\t" | |
878 "movq %%mm5, %%mm3\n\t" | |
879 "punpcklbw %%mm7, %%mm4\n\t" | |
880 "punpcklbw %%mm7, %%mm5\n\t" | |
881 "punpckhbw %%mm7, %%mm1\n\t" | |
882 "punpckhbw %%mm7, %%mm3\n\t" | |
883 "paddusw %%mm5, %%mm4\n\t" | |
884 "paddusw %%mm3, %%mm1\n\t" | |
885 "paddusw %%mm6, %%mm4\n\t" | |
886 "paddusw %%mm6, %%mm1\n\t" | |
887 "paddusw %%mm4, %%mm0\n\t" | |
888 "paddusw %%mm1, %%mm2\n\t" | |
889 "movq %0, %%mm1\n\t" | |
890 "psrlw $2, %%mm0\n\t" | |
891 "movq %%mm1, %%mm3\n\t" | |
892 "psrlw $2, %%mm2\n\t" | |
893 "punpcklbw %%mm7, %%mm1\n\t" | |
894 "punpckhbw %%mm7, %%mm3\n\t" | |
895 "paddusw %%mm1, %%mm0\n\t" | |
896 "paddusw %%mm3, %%mm2\n\t" | |
897 "psrlw $1, %%mm0\n\t" | |
898 "psrlw $1, %%mm2\n\t" | |
899 "packuswb %%mm2, %%mm0\n\t" | |
900 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
901 :"+m"(*p) |
0 | 902 :"m"(*pix), |
903 "m"(*(pix+line_size)) | |
904 :"memory"); | |
905 pix += line_size; | |
906 p += line_size; | |
907 } while(--h); | |
908 } | |
909 | |
296 | 910 static void clear_blocks_mmx(DCTELEM *blocks) |
911 { | |
912 asm volatile( | |
913 "pxor %%mm7, %%mm7 \n\t" | |
914 "movl $-128*6, %%eax \n\t" | |
915 "1: \n\t" | |
916 "movq %%mm7, (%0, %%eax) \n\t" | |
917 "movq %%mm7, 8(%0, %%eax) \n\t" | |
918 "movq %%mm7, 16(%0, %%eax) \n\t" | |
919 "movq %%mm7, 24(%0, %%eax) \n\t" | |
920 "addl $32, %%eax \n\t" | |
921 " js 1b \n\t" | |
922 : : "r" (((int)blocks)+128*6) | |
923 : "%eax" | |
924 ); | |
925 } | |
926 | |
393 | 927 #if 0 |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
/* No-op stand-in used by the disabled speed-testing assignments in dsputil_init_mmx(). */
static void just_return()
{
}
393 | 929 #endif |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
930 |
415 | 931 #ifndef TESTCPU_MAIN |
0 | 932 void dsputil_init_mmx(void) |
933 { | |
934 mm_flags = mm_support(); | |
188 | 935 #if 1 |
936 printf("libavcodec: CPU flags:"); | |
0 | 937 if (mm_flags & MM_MMX) |
938 printf(" mmx"); | |
939 if (mm_flags & MM_MMXEXT) | |
940 printf(" mmxext"); | |
941 if (mm_flags & MM_3DNOW) | |
942 printf(" 3dnow"); | |
943 if (mm_flags & MM_SSE) | |
944 printf(" sse"); | |
945 if (mm_flags & MM_SSE2) | |
946 printf(" sse2"); | |
947 printf("\n"); | |
948 #endif | |
949 | |
950 if (mm_flags & MM_MMX) { | |
951 get_pixels = get_pixels_mmx; | |
324 | 952 diff_pixels = diff_pixels_mmx; |
0 | 953 put_pixels_clamped = put_pixels_clamped_mmx; |
954 add_pixels_clamped = add_pixels_clamped_mmx; | |
296 | 955 clear_blocks= clear_blocks_mmx; |
415 | 956 |
294 | 957 pix_abs16x16 = pix_abs16x16_mmx; |
958 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
959 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
0 | 960 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; |
294 | 961 pix_abs8x8 = pix_abs8x8_mmx; |
962 pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
963 pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
964 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
0 | 965 av_fdct = fdct_mmx; |
966 | |
967 put_pixels_tab[0] = put_pixels_mmx; | |
968 put_pixels_tab[1] = put_pixels_x2_mmx; | |
969 put_pixels_tab[2] = put_pixels_y2_mmx; | |
970 put_pixels_tab[3] = put_pixels_xy2_mmx; | |
971 | |
972 put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
973 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
974 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
975 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
415 | 976 |
0 | 977 avg_pixels_tab[0] = avg_pixels_mmx; |
978 avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
979 avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
980 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
981 | |
982 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
983 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
984 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
985 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
386 | 986 |
0 | 987 if (mm_flags & MM_MMXEXT) { |
294 | 988 pix_abs16x16 = pix_abs16x16_mmx2; |
989 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | |
990 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | |
991 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; | |
415 | 992 |
294 | 993 pix_abs8x8 = pix_abs8x8_mmx2; |
994 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | |
995 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | |
996 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | |
386 | 997 |
998 put_pixels_tab[1] = put_pixels_x2_mmx2; | |
999 put_pixels_tab[2] = put_pixels_y2_mmx2; | |
1000 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2; | |
1001 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2; | |
415 | 1002 |
386 | 1003 avg_pixels_tab[0] = avg_pixels_mmx2; |
1004 avg_pixels_tab[1] = avg_pixels_x2_mmx2; | |
1005 avg_pixels_tab[2] = avg_pixels_y2_mmx2; | |
1006 avg_pixels_tab[3] = avg_pixels_xy2_mmx2; | |
0 | 1007 } else if (mm_flags & MM_3DNOW) { |
1008 put_pixels_tab[1] = put_pixels_x2_3dnow; | |
1009 put_pixels_tab[2] = put_pixels_y2_3dnow; | |
386 | 1010 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow; |
1011 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow; | |
393 | 1012 |
0 | 1013 avg_pixels_tab[0] = avg_pixels_3dnow; |
1014 avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
1015 avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
1016 avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
1017 } | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1018 |
42 | 1019 /* idct */ |
1020 if (mm_flags & MM_MMXEXT) { | |
1021 ff_idct = ff_mmxext_idct; | |
1022 } else { | |
1023 ff_idct = ff_mmx_idct; | |
1024 } | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1025 #ifdef SIMPLE_IDCT |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1026 // ff_idct = simple_idct; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1027 ff_idct = simple_idct_mmx; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1028 #endif |
0 | 1029 } |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1030 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1031 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1032 // for speed testing |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1033 get_pixels = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1034 put_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1035 add_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1036 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1037 pix_abs16x16 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1038 pix_abs16x16_x2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1039 pix_abs16x16_y2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1040 pix_abs16x16_xy2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1041 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1042 put_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1043 put_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1044 put_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1045 put_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1046 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1047 put_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1048 put_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1049 put_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1050 put_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1051 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1052 avg_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1053 avg_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1054 avg_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1055 avg_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1056 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1057 avg_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1058 avg_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1059 avg_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1060 avg_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1061 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1062 //av_fdct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1063 //ff_idct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1064 #endif |
0 | 1065 } |
402 | 1066 |
1067 /* remove any non bit exact operation (testing purpose). NOTE that | |
1068 this function should be kept as small as possible because it is | |
1069 always difficult to test automatically non bit exact cases. */ | |
1070 void dsputil_set_bit_exact_mmx(void) | |
1071 { | |
1072 if (mm_flags & MM_MMX) { | |
1073 if (mm_flags & MM_MMXEXT) { | |
1074 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1075 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1076 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1077 } else if (mm_flags & MM_3DNOW) { | |
1078 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1079 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1080 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1081 } | |
1082 } | |
1083 } | |
415 | 1084 |
1085 #else // TESTCPU_MAIN | |
/*
 * For testing the speed of various routines; this should probably be
 * extended into a general-purpose regression test later.
 *
 * For now, use it this way:
 *
 * gcc -O4 -fomit-frame-pointer -DHAVE_AV_CONFIG_H -DTESTCPU_MAIN -I../.. -o test dsputil_mmx.c
 *
 * in the libavcodec/i386 directory, then run ./test
 */
/*
 * Read the CPU time-stamp counter.
 *
 * RDTSC returns the low 32 bits in EAX and the high 32 bits in EDX.  The
 * two halves are captured through explicit "=a"/"=d" outputs and combined
 * here; the "=A" constraint only denotes the EDX:EAX pair on 32-bit x86
 * and selects a single register on x86-64, which yields a wrong value.
 *
 * Returns the 64-bit tick count.
 */
static inline long long rdtsc()
{
    unsigned int lo, hi;

    __asm__ __volatile__( "rdtsc\n\t"
                          : "=a" (lo), "=d" (hi)
                        );
    return (long long)(((unsigned long long)hi << 32) | lo);
}
1104 | |
1105 int main(int argc, char* argv[]) | |
1106 { | |
1107 volatile int v; | |
1108 int i; | |
1109 const int linesize = 720; | |
417 | 1110 char empty[32768]; |
415 | 1111 uint64_t te, ts = rdtsc(); |
417 | 1112 char* im, *bu = empty; |
415 | 1113 op_pixels_func fc = put_pixels_y2_mmx2; |
417 | 1114 bu += 32; |
1115 bu =(char*)(((long)bu) & ~0xf); // 16 bytes alignment | |
1116 im = bu; | |
415 | 1117 for(i=0; i<1000000; i++){ |
1118 fc(im, im + 1000, linesize, 16); | |
417 | 1119 im += 4; // |
415 | 1120 if (im > bu + 10000) |
1121 im = bu; | |
1122 } | |
1123 te = rdtsc(); | |
1124 printf("CPU Ticks: %7d\n", (int)(te - ts)); | |
1125 } | |
1126 #endif |