Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 466:805714c0c10f libavcodec
new field for communicating with external postprocessing
author | nickols_k |
---|---|
date | Mon, 03 Jun 2002 07:01:42 +0000 |
parents | e8c8ca9106aa |
children | d7f65ea52aaa |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
0 | 4 * |
429 | 5 * This library is free software; you can redistribute it and/or |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
0 | 9 * |
429 | 10 * This library is distributed in the hope that it will be useful, |
0 | 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 * Lesser General Public License for more details. | |
0 | 14 * |
429 | 15 * You should have received a copy of the GNU Lesser General Public |
16 * License along with this library; if not, write to the Free Software | |
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
0 | 18 * |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 #include "../dsputil.h" | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
23 #include "../simple_idct.h" |
0 | 24 |
5 | 25 int mm_flags; /* multimedia extension flags */ |
26 | |
294 | 27 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); |
28 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
29 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
30 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
31 | |
32 int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
33 int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
34 int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
35 int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
36 | |
37 int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
38 int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
39 int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
40 int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
41 | |
42 int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
43 int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
44 int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
45 int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
46 | |
42 | 47 /* external functions, from idct_mmx.c */ |
48 void ff_mmx_idct(DCTELEM *block); | |
49 void ff_mmxext_idct(DCTELEM *block); | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
50 |
0 | 51 /* pixel operations */ |
387 | 52 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; |
53 static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; | |
54 static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; | |
0 | 55 |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
/* Emit a ".balign 8" directive at the point of use. */
#define JUMPALIGN() __asm __volatile (".balign 8"::)

/* Clear MMX register regd (regd xor regd). */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/*
 * Set every 16-bit word of regd to 0x0001 without touching memory:
 * pcmpeqd of a register with itself gives all ones, and a logical
 * right shift by 15 leaves only bit 0 of each word.
 */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/*
 * Set every byte of regd to 0xfe: all ones, then each byte added to
 * itself (0xff + 0xff wraps to 0xfe).
 */
#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
/* Non-PIC build: load the constants straight from mm_bone / mm_wtwo. */
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1

/* 0x01 in every byte: words of 0x0001 packed down to bytes. */
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

/* 0x0002 in every word: words of 0x0001 shifted left by one. */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
88 |
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// mm6 is supposed to contain 0xfefefefefefefefe

/*
 * Byte-wise average of rega and regb, rounded down, written to regr:
 *     regr = (a & b) + (((a ^ b) & 0xfe) >> 1)
 * Masking with 0xfe clears the low bit of every byte first, so the 64-bit
 * psrlq cannot carry a bit from one byte into its neighbour.
 */
#define PAVGB_MMX_NO_RND(rega, regb, regr) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

/*
 * Byte-wise average of rega and regb, rounded up, written to regr:
 *     regr = (a | b) - (((a ^ b) & 0xfe) >> 1)
 */
#define PAVGB_MMX(rega, regb, regr) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"

/*
 * Two interleaved rounded-down averages:
 *     regr = avg(rega, regb),  regp = avg(regc, regd)
 * rega and regc are unmodified, regb and regd are trashed.
 */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

/*
 * Two interleaved rounded-up averages:
 *     regr = avg(rega, regb),  regp = avg(regc, regd)
 * rega and regc are unmodified, regb and regd are trashed.
 */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
135 |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
136 /***********************************/ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
137 /* MMX no rounding */ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
138 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx |
448 | 139 #define SET_RND MOVQ_WONE |
140 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) | |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
141 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
142 #include "dsputil_mmx_rnd.h" |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
143 |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
144 #undef DEF |
448 | 145 #undef SET_RND |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
146 #undef PAVGBP |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
147 /***********************************/ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
148 /* MMX rounding */ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
149 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
150 #define DEF(x, y) x ## _ ## y ##_mmx |
448 | 151 #define SET_RND MOVQ_WTWO |
152 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) | |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
153 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
154 #include "dsputil_mmx_rnd.h" |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
155 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
156 #undef DEF |
448 | 157 #undef SET_RND |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
158 #undef PAVGBP |
387 | 159 |
0 | 160 /***********************************/ |
161 /* 3Dnow specific */ | |
162 | |
163 #define DEF(x) x ## _3dnow | |
164 /* for Athlons PAVGUSB is prefered */ | |
165 #define PAVGB "pavgusb" | |
166 | |
167 #include "dsputil_mmx_avg.h" | |
168 | |
169 #undef DEF | |
170 #undef PAVGB | |
171 | |
172 /***********************************/ | |
173 /* MMX2 specific */ | |
174 | |
386 | 175 #define DEF(x) x ## _mmx2 |
0 | 176 |
177 /* Introduced only in MMX2 set */ | |
178 #define PAVGB "pavgb" | |
179 | |
180 #include "dsputil_mmx_avg.h" | |
181 | |
182 #undef DEF | |
183 #undef PAVGB | |
184 | |
185 /***********************************/ | |
186 /* standard MMX */ | |
187 | |
188 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
189 { | |
386 | 190 asm volatile( |
191 "movl $-128, %%eax \n\t" | |
192 "pxor %%mm7, %%mm7 \n\t" | |
193 ".balign 16 \n\t" | |
194 "1: \n\t" | |
195 "movq (%0), %%mm0 \n\t" | |
196 "movq (%0, %2), %%mm2 \n\t" | |
197 "movq %%mm0, %%mm1 \n\t" | |
198 "movq %%mm2, %%mm3 \n\t" | |
199 "punpcklbw %%mm7, %%mm0 \n\t" | |
200 "punpckhbw %%mm7, %%mm1 \n\t" | |
201 "punpcklbw %%mm7, %%mm2 \n\t" | |
202 "punpckhbw %%mm7, %%mm3 \n\t" | |
203 "movq %%mm0, (%1, %%eax)\n\t" | |
204 "movq %%mm1, 8(%1, %%eax)\n\t" | |
205 "movq %%mm2, 16(%1, %%eax)\n\t" | |
206 "movq %%mm3, 24(%1, %%eax)\n\t" | |
207 "addl %3, %0 \n\t" | |
208 "addl $32, %%eax \n\t" | |
209 "js 1b \n\t" | |
210 : "+r" (pixels) | |
211 : "r" (block+64), "r" (line_size), "r" (line_size*2) | |
212 : "%eax" | |
213 ); | |
0 | 214 } |
215 | |
324 | 216 static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) |
217 { | |
218 asm volatile( | |
386 | 219 "pxor %%mm7, %%mm7 \n\t" |
220 "movl $-128, %%eax \n\t" | |
324 | 221 ".balign 16 \n\t" |
222 "1: \n\t" | |
223 "movq (%0), %%mm0 \n\t" | |
224 "movq (%1), %%mm2 \n\t" | |
225 "movq %%mm0, %%mm1 \n\t" | |
226 "movq %%mm2, %%mm3 \n\t" | |
227 "punpcklbw %%mm7, %%mm0 \n\t" | |
228 "punpckhbw %%mm7, %%mm1 \n\t" | |
229 "punpcklbw %%mm7, %%mm2 \n\t" | |
230 "punpckhbw %%mm7, %%mm3 \n\t" | |
231 "psubw %%mm2, %%mm0 \n\t" | |
232 "psubw %%mm3, %%mm1 \n\t" | |
233 "movq %%mm0, (%2, %%eax)\n\t" | |
234 "movq %%mm1, 8(%2, %%eax)\n\t" | |
235 "addl %3, %0 \n\t" | |
236 "addl %3, %1 \n\t" | |
237 "addl $16, %%eax \n\t" | |
238 "jnz 1b \n\t" | |
239 : "+r" (s1), "+r" (s2) | |
240 : "r" (block+64), "r" (stride) | |
241 : "%eax" | |
242 ); | |
243 } | |
244 | |
0 | 245 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) |
246 { | |
247 const DCTELEM *p; | |
248 UINT8 *pix; | |
249 | |
250 /* read the pixels */ | |
251 p = block; | |
252 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
253 /* unrolled loop */ |
0 | 254 __asm __volatile( |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
255 "movq %3, %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
256 "movq 8%3, %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
257 "movq 16%3, %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
258 "movq 24%3, %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
259 "movq 32%3, %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
260 "movq 40%3, %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
261 "movq 48%3, %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
262 "movq 56%3, %%mm7\n\t" |
0 | 263 "packuswb %%mm1, %%mm0\n\t" |
264 "packuswb %%mm3, %%mm2\n\t" | |
265 "packuswb %%mm5, %%mm4\n\t" | |
266 "packuswb %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
267 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
268 "movq %%mm2, (%0, %1)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
269 "movq %%mm4, (%0, %1, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
270 "movq %%mm6, (%0, %2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
271 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) |
0 | 272 :"memory"); |
273 pix += line_size*4; | |
274 p += 32; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
275 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
276 // if here would be an exact copy of the code above |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
277 // compiler would generate some very strange code |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
278 // thus using "r" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
279 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
280 "movq (%3), %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
281 "movq 8(%3), %%mm1\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
282 "movq 16(%3), %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
283 "movq 24(%3), %%mm3\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
284 "movq 32(%3), %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
285 "movq 40(%3), %%mm5\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
286 "movq 48(%3), %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
287 "movq 56(%3), %%mm7\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
288 "packuswb %%mm1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
289 "packuswb %%mm3, %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
290 "packuswb %%mm5, %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
291 "packuswb %%mm7, %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
292 "movq %%mm0, (%0)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
293 "movq %%mm2, (%0, %1)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
294 "movq %%mm4, (%0, %1, 2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
295 "movq %%mm6, (%0, %2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
296 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
297 :"memory"); |
0 | 298 } |
299 | |
300 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
301 { | |
302 const DCTELEM *p; | |
303 UINT8 *pix; | |
304 int i; | |
305 | |
306 /* read the pixels */ | |
307 p = block; | |
308 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
309 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
310 i = 4; |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
311 do { |
0 | 312 __asm __volatile( |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
313 "movq (%2), %%mm0\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
314 "movq 8(%2), %%mm1\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
315 "movq 16(%2), %%mm2\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
316 "movq 24(%2), %%mm3\n\t" |
0 | 317 "movq %0, %%mm4\n\t" |
318 "movq %1, %%mm6\n\t" | |
319 "movq %%mm4, %%mm5\n\t" | |
320 "punpcklbw %%mm7, %%mm4\n\t" | |
321 "punpckhbw %%mm7, %%mm5\n\t" | |
322 "paddsw %%mm4, %%mm0\n\t" | |
323 "paddsw %%mm5, %%mm1\n\t" | |
324 "movq %%mm6, %%mm5\n\t" | |
325 "punpcklbw %%mm7, %%mm6\n\t" | |
326 "punpckhbw %%mm7, %%mm5\n\t" | |
327 "paddsw %%mm6, %%mm2\n\t" | |
328 "paddsw %%mm5, %%mm3\n\t" | |
329 "packuswb %%mm1, %%mm0\n\t" | |
330 "packuswb %%mm3, %%mm2\n\t" | |
331 "movq %%mm0, %0\n\t" | |
332 "movq %%mm2, %1\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
333 :"+m"(*pix), "+m"(*(pix+line_size)) |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
334 :"r"(p) |
0 | 335 :"memory"); |
336 pix += line_size*2; | |
337 p += 16; | |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
338 } while (--i); |
0 | 339 } |
340 | |
341 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
342 { | |
420 | 343 asm volatile |
344 ( | |
345 "lea (%3, %3), %%eax \n\t" | |
422 | 346 ".balign 8 \n\t" |
420 | 347 "1: \n\t" |
348 "movq (%1), %%mm0 \n\t" | |
349 "movq (%1, %3), %%mm1 \n\t" | |
350 "movq %%mm0, (%2) \n\t" | |
351 "movq %%mm1, (%2, %3) \n\t" | |
352 "addl %%eax, %1 \n\t" | |
353 "addl %%eax, %2 \n\t" | |
354 "movq (%1), %%mm0 \n\t" | |
355 "movq (%1, %3), %%mm1 \n\t" | |
356 "movq %%mm0, (%2) \n\t" | |
357 "movq %%mm1, (%2, %3) \n\t" | |
358 "addl %%eax, %1 \n\t" | |
359 "addl %%eax, %2 \n\t" | |
360 "subl $4, %0 \n\t" | |
361 "jnz 1b \n\t" | |
362 : "+g"(h), "+r" (pixels), "+r" (block) | |
363 : "r"(line_size) | |
364 : "%eax", "memory" | |
365 ); | |
0 | 366 } |
367 | |
368 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
369 { | |
370 UINT8 *p; | |
371 const UINT8 *pix; | |
372 p = block; | |
373 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
374 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
375 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
376 JUMPALIGN(); |
0 | 377 do { |
378 __asm __volatile( | |
379 "movq %0, %%mm0\n\t" | |
380 "movq %1, %%mm1\n\t" | |
381 "movq %%mm0, %%mm2\n\t" | |
382 "movq %%mm1, %%mm3\n\t" | |
383 "punpcklbw %%mm7, %%mm0\n\t" | |
384 "punpcklbw %%mm7, %%mm1\n\t" | |
385 "punpckhbw %%mm7, %%mm2\n\t" | |
386 "punpckhbw %%mm7, %%mm3\n\t" | |
387 "paddusw %%mm1, %%mm0\n\t" | |
388 "paddusw %%mm3, %%mm2\n\t" | |
389 "paddusw %%mm6, %%mm0\n\t" | |
390 "paddusw %%mm6, %%mm2\n\t" | |
391 "psrlw $1, %%mm0\n\t" | |
392 "psrlw $1, %%mm2\n\t" | |
393 "packuswb %%mm2, %%mm0\n\t" | |
394 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
395 :"+m"(*p) |
0 | 396 :"m"(*pix) |
397 :"memory"); | |
398 pix += line_size; | |
399 p += line_size; | |
400 } | |
401 while (--h); | |
402 } | |
403 | |
404 static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
405 { | |
406 UINT8 *p; | |
407 const UINT8 *pix; | |
408 p = block; | |
409 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
410 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
411 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
412 JUMPALIGN(); |
0 | 413 do { |
414 __asm __volatile( | |
415 "movq %1, %%mm1\n\t" | |
416 "movq %0, %%mm0\n\t" | |
417 "movq 1%1, %%mm4\n\t" | |
418 "movq %%mm0, %%mm2\n\t" | |
419 "movq %%mm1, %%mm3\n\t" | |
420 "movq %%mm4, %%mm5\n\t" | |
421 "punpcklbw %%mm7, %%mm1\n\t" | |
422 "punpckhbw %%mm7, %%mm3\n\t" | |
423 "punpcklbw %%mm7, %%mm4\n\t" | |
424 "punpckhbw %%mm7, %%mm5\n\t" | |
425 "punpcklbw %%mm7, %%mm0\n\t" | |
426 "punpckhbw %%mm7, %%mm2\n\t" | |
427 "paddusw %%mm4, %%mm1\n\t" | |
428 "paddusw %%mm5, %%mm3\n\t" | |
429 "paddusw %%mm6, %%mm1\n\t" | |
430 "paddusw %%mm6, %%mm3\n\t" | |
431 "psrlw $1, %%mm1\n\t" | |
432 "psrlw $1, %%mm3\n\t" | |
433 "paddusw %%mm6, %%mm0\n\t" | |
434 "paddusw %%mm6, %%mm2\n\t" | |
435 "paddusw %%mm1, %%mm0\n\t" | |
436 "paddusw %%mm3, %%mm2\n\t" | |
437 "psrlw $1, %%mm0\n\t" | |
438 "psrlw $1, %%mm2\n\t" | |
439 "packuswb %%mm2, %%mm0\n\t" | |
440 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
441 :"+m"(*p) |
0 | 442 :"m"(*pix) |
443 :"memory"); | |
444 pix += line_size; | |
445 p += line_size; | |
446 } while (--h); | |
447 } | |
448 | |
449 static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
450 { | |
451 UINT8 *p; | |
452 const UINT8 *pix; | |
453 p = block; | |
454 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
455 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
456 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
457 JUMPALIGN(); |
0 | 458 do { |
459 __asm __volatile( | |
460 "movq %1, %%mm1\n\t" | |
461 "movq %0, %%mm0\n\t" | |
462 "movq %2, %%mm4\n\t" | |
463 "movq %%mm0, %%mm2\n\t" | |
464 "movq %%mm1, %%mm3\n\t" | |
465 "movq %%mm4, %%mm5\n\t" | |
466 "punpcklbw %%mm7, %%mm1\n\t" | |
467 "punpckhbw %%mm7, %%mm3\n\t" | |
468 "punpcklbw %%mm7, %%mm4\n\t" | |
469 "punpckhbw %%mm7, %%mm5\n\t" | |
470 "punpcklbw %%mm7, %%mm0\n\t" | |
471 "punpckhbw %%mm7, %%mm2\n\t" | |
472 "paddusw %%mm4, %%mm1\n\t" | |
473 "paddusw %%mm5, %%mm3\n\t" | |
474 "paddusw %%mm6, %%mm1\n\t" | |
475 "paddusw %%mm6, %%mm3\n\t" | |
476 "psrlw $1, %%mm1\n\t" | |
477 "psrlw $1, %%mm3\n\t" | |
478 "paddusw %%mm6, %%mm0\n\t" | |
479 "paddusw %%mm6, %%mm2\n\t" | |
480 "paddusw %%mm1, %%mm0\n\t" | |
481 "paddusw %%mm3, %%mm2\n\t" | |
482 "psrlw $1, %%mm0\n\t" | |
483 "psrlw $1, %%mm2\n\t" | |
484 "packuswb %%mm2, %%mm0\n\t" | |
485 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
486 :"+m"(*p) |
0 | 487 :"m"(*pix), "m"(*(pix+line_size)) |
488 :"memory"); | |
489 pix += line_size; | |
490 p += line_size ; | |
491 } while(--h); | |
492 } | |
493 | |
494 static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
495 { | |
496 UINT8 *p; | |
497 const UINT8 *pix; | |
498 p = block; | |
499 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
500 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
501 // this doesn't seem to be used offten - so |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
502 // the inside usage of mm_wone is not optimized |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
503 MOVQ_WTWO(mm6); |
0 | 504 do { |
505 __asm __volatile( | |
506 "movq %1, %%mm0\n\t" | |
507 "movq %2, %%mm1\n\t" | |
508 "movq 1%1, %%mm4\n\t" | |
509 "movq 1%2, %%mm5\n\t" | |
510 "movq %%mm0, %%mm2\n\t" | |
511 "movq %%mm1, %%mm3\n\t" | |
512 "punpcklbw %%mm7, %%mm0\n\t" | |
513 "punpcklbw %%mm7, %%mm1\n\t" | |
514 "punpckhbw %%mm7, %%mm2\n\t" | |
515 "punpckhbw %%mm7, %%mm3\n\t" | |
516 "paddusw %%mm1, %%mm0\n\t" | |
517 "paddusw %%mm3, %%mm2\n\t" | |
518 "movq %%mm4, %%mm1\n\t" | |
519 "movq %%mm5, %%mm3\n\t" | |
520 "punpcklbw %%mm7, %%mm4\n\t" | |
521 "punpcklbw %%mm7, %%mm5\n\t" | |
522 "punpckhbw %%mm7, %%mm1\n\t" | |
523 "punpckhbw %%mm7, %%mm3\n\t" | |
524 "paddusw %%mm5, %%mm4\n\t" | |
525 "paddusw %%mm3, %%mm1\n\t" | |
526 "paddusw %%mm6, %%mm4\n\t" | |
527 "paddusw %%mm6, %%mm1\n\t" | |
528 "paddusw %%mm4, %%mm0\n\t" | |
529 "paddusw %%mm1, %%mm2\n\t" | |
530 "movq %3, %%mm5\n\t" | |
531 "psrlw $2, %%mm0\n\t" | |
532 "movq %0, %%mm1\n\t" | |
533 "psrlw $2, %%mm2\n\t" | |
534 "movq %%mm1, %%mm3\n\t" | |
535 "punpcklbw %%mm7, %%mm1\n\t" | |
536 "punpckhbw %%mm7, %%mm3\n\t" | |
537 "paddusw %%mm1, %%mm0\n\t" | |
538 "paddusw %%mm3, %%mm2\n\t" | |
539 "paddusw %%mm5, %%mm0\n\t" | |
540 "paddusw %%mm5, %%mm2\n\t" | |
541 "psrlw $1, %%mm0\n\t" | |
542 "psrlw $1, %%mm2\n\t" | |
543 "packuswb %%mm2, %%mm0\n\t" | |
544 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
545 :"+m"(*p) |
0 | 546 :"m"(*pix), |
8 | 547 "m"(*(pix+line_size)), "m"(mm_wone) |
0 | 548 :"memory"); |
549 pix += line_size; | |
550 p += line_size ; | |
551 } while(--h); | |
552 } | |
553 | |
554 static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
555 { | |
556 UINT8 *p; | |
557 const UINT8 *pix; | |
558 p = block; | |
559 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
560 MOVQ_ZERO(mm7); |
0 | 561 do { |
562 __asm __volatile( | |
563 "movq %1, %%mm0\n\t" | |
564 "movq %0, %%mm1\n\t" | |
565 "movq %%mm0, %%mm2\n\t" | |
566 "movq %%mm1, %%mm3\n\t" | |
567 "punpcklbw %%mm7, %%mm0\n\t" | |
568 "punpcklbw %%mm7, %%mm1\n\t" | |
569 "punpckhbw %%mm7, %%mm2\n\t" | |
570 "punpckhbw %%mm7, %%mm3\n\t" | |
571 "paddusw %%mm1, %%mm0\n\t" | |
572 "paddusw %%mm3, %%mm2\n\t" | |
573 "psrlw $1, %%mm0\n\t" | |
574 "psrlw $1, %%mm2\n\t" | |
575 "packuswb %%mm2, %%mm0\n\t" | |
576 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
577 :"+m"(*p) |
0 | 578 :"m"(*pix) |
579 :"memory"); | |
580 pix += line_size; | |
581 p += line_size ; | |
582 } while (--h); | |
583 } | |
584 | |
585 static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
586 { | |
587 UINT8 *p; | |
588 const UINT8 *pix; | |
589 p = block; | |
590 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
591 MOVQ_ZERO(mm7); |
0 | 592 do { |
593 __asm __volatile( | |
594 "movq %1, %%mm0\n\t" | |
595 "movq 1%1, %%mm1\n\t" | |
596 "movq %0, %%mm4\n\t" | |
597 "movq %%mm0, %%mm2\n\t" | |
598 "movq %%mm1, %%mm3\n\t" | |
599 "movq %%mm4, %%mm5\n\t" | |
600 "punpcklbw %%mm7, %%mm0\n\t" | |
601 "punpcklbw %%mm7, %%mm1\n\t" | |
602 "punpckhbw %%mm7, %%mm2\n\t" | |
603 "punpckhbw %%mm7, %%mm3\n\t" | |
604 "punpcklbw %%mm7, %%mm4\n\t" | |
605 "punpckhbw %%mm7, %%mm5\n\t" | |
606 "paddusw %%mm1, %%mm0\n\t" | |
607 "paddusw %%mm3, %%mm2\n\t" | |
608 "psrlw $1, %%mm0\n\t" | |
609 "psrlw $1, %%mm2\n\t" | |
610 "paddusw %%mm4, %%mm0\n\t" | |
611 "paddusw %%mm5, %%mm2\n\t" | |
612 "psrlw $1, %%mm0\n\t" | |
613 "psrlw $1, %%mm2\n\t" | |
614 "packuswb %%mm2, %%mm0\n\t" | |
615 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
616 :"+m"(*p) |
0 | 617 :"m"(*pix) |
618 :"memory"); | |
619 pix += line_size; | |
620 p += line_size; | |
621 } while (--h); | |
622 } | |
623 | |
624 static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
625 { | |
626 UINT8 *p; | |
627 const UINT8 *pix; | |
628 p = block; | |
629 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
630 MOVQ_ZERO(mm7); |
0 | 631 do { |
632 __asm __volatile( | |
633 "movq %1, %%mm0\n\t" | |
634 "movq %2, %%mm1\n\t" | |
635 "movq %0, %%mm4\n\t" | |
636 "movq %%mm0, %%mm2\n\t" | |
637 "movq %%mm1, %%mm3\n\t" | |
638 "movq %%mm4, %%mm5\n\t" | |
639 "punpcklbw %%mm7, %%mm0\n\t" | |
640 "punpcklbw %%mm7, %%mm1\n\t" | |
641 "punpckhbw %%mm7, %%mm2\n\t" | |
642 "punpckhbw %%mm7, %%mm3\n\t" | |
643 "punpcklbw %%mm7, %%mm4\n\t" | |
644 "punpckhbw %%mm7, %%mm5\n\t" | |
645 "paddusw %%mm1, %%mm0\n\t" | |
646 "paddusw %%mm3, %%mm2\n\t" | |
647 "psrlw $1, %%mm0\n\t" | |
648 "psrlw $1, %%mm2\n\t" | |
649 "paddusw %%mm4, %%mm0\n\t" | |
650 "paddusw %%mm5, %%mm2\n\t" | |
651 "psrlw $1, %%mm0\n\t" | |
652 "psrlw $1, %%mm2\n\t" | |
653 "packuswb %%mm2, %%mm0\n\t" | |
654 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
655 :"+m"(*p) |
0 | 656 :"m"(*pix), "m"(*(pix+line_size)) |
657 :"memory"); | |
658 pix += line_size; | |
659 p += line_size ; | |
660 } while(--h); | |
661 } | |
662 | |
663 static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
664 { | |
665 UINT8 *p; | |
666 const UINT8 *pix; | |
667 p = block; | |
668 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
669 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
670 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
671 JUMPALIGN(); |
0 | 672 do { |
673 __asm __volatile( | |
674 "movq %1, %%mm0\n\t" | |
675 "movq %2, %%mm1\n\t" | |
676 "movq 1%1, %%mm4\n\t" | |
677 "movq 1%2, %%mm5\n\t" | |
678 "movq %%mm0, %%mm2\n\t" | |
679 "movq %%mm1, %%mm3\n\t" | |
680 "punpcklbw %%mm7, %%mm0\n\t" | |
681 "punpcklbw %%mm7, %%mm1\n\t" | |
682 "punpckhbw %%mm7, %%mm2\n\t" | |
683 "punpckhbw %%mm7, %%mm3\n\t" | |
684 "paddusw %%mm1, %%mm0\n\t" | |
685 "paddusw %%mm3, %%mm2\n\t" | |
686 "movq %%mm4, %%mm1\n\t" | |
687 "movq %%mm5, %%mm3\n\t" | |
688 "punpcklbw %%mm7, %%mm4\n\t" | |
689 "punpcklbw %%mm7, %%mm5\n\t" | |
690 "punpckhbw %%mm7, %%mm1\n\t" | |
691 "punpckhbw %%mm7, %%mm3\n\t" | |
692 "paddusw %%mm5, %%mm4\n\t" | |
693 "paddusw %%mm3, %%mm1\n\t" | |
694 "paddusw %%mm6, %%mm4\n\t" | |
695 "paddusw %%mm6, %%mm1\n\t" | |
696 "paddusw %%mm4, %%mm0\n\t" | |
697 "paddusw %%mm1, %%mm2\n\t" | |
698 "movq %0, %%mm1\n\t" | |
699 "psrlw $2, %%mm0\n\t" | |
700 "movq %%mm1, %%mm3\n\t" | |
701 "psrlw $2, %%mm2\n\t" | |
702 "punpcklbw %%mm7, %%mm1\n\t" | |
703 "punpckhbw %%mm7, %%mm3\n\t" | |
704 "paddusw %%mm1, %%mm0\n\t" | |
705 "paddusw %%mm3, %%mm2\n\t" | |
706 "psrlw $1, %%mm0\n\t" | |
707 "psrlw $1, %%mm2\n\t" | |
708 "packuswb %%mm2, %%mm0\n\t" | |
709 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
710 :"+m"(*p) |
0 | 711 :"m"(*pix), |
712 "m"(*(pix+line_size)) | |
713 :"memory"); | |
714 pix += line_size; | |
715 p += line_size; | |
716 } while(--h); | |
717 } | |
718 | |
296 | 719 static void clear_blocks_mmx(DCTELEM *blocks) |
720 { | |
721 asm volatile( | |
722 "pxor %%mm7, %%mm7 \n\t" | |
723 "movl $-128*6, %%eax \n\t" | |
724 "1: \n\t" | |
725 "movq %%mm7, (%0, %%eax) \n\t" | |
726 "movq %%mm7, 8(%0, %%eax) \n\t" | |
727 "movq %%mm7, 16(%0, %%eax) \n\t" | |
728 "movq %%mm7, 24(%0, %%eax) \n\t" | |
729 "addl $32, %%eax \n\t" | |
730 " js 1b \n\t" | |
731 : : "r" (((int)blocks)+128*6) | |
732 : "%eax" | |
733 ); | |
734 } | |
735 | |
#if 0
/*
 * Do-nothing stub. Its only users are the disabled "for speed testing"
 * assignments at the end of dsputil_init_mmx(), which plug it into function
 * pointers of several different types -- hence the unprototyped "()".
 */
static void just_return()
{
}
#endif
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
739 |
0 | 740 void dsputil_init_mmx(void) |
741 { | |
742 mm_flags = mm_support(); | |
188 | 743 #if 1 |
744 printf("libavcodec: CPU flags:"); | |
0 | 745 if (mm_flags & MM_MMX) |
746 printf(" mmx"); | |
747 if (mm_flags & MM_MMXEXT) | |
748 printf(" mmxext"); | |
749 if (mm_flags & MM_3DNOW) | |
750 printf(" 3dnow"); | |
751 if (mm_flags & MM_SSE) | |
752 printf(" sse"); | |
753 if (mm_flags & MM_SSE2) | |
754 printf(" sse2"); | |
755 printf("\n"); | |
756 #endif | |
757 | |
758 if (mm_flags & MM_MMX) { | |
759 get_pixels = get_pixels_mmx; | |
324 | 760 diff_pixels = diff_pixels_mmx; |
0 | 761 put_pixels_clamped = put_pixels_clamped_mmx; |
762 add_pixels_clamped = add_pixels_clamped_mmx; | |
296 | 763 clear_blocks= clear_blocks_mmx; |
415 | 764 |
294 | 765 pix_abs16x16 = pix_abs16x16_mmx; |
766 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
767 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
0 | 768 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; |
294 | 769 pix_abs8x8 = pix_abs8x8_mmx; |
770 pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
771 pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
772 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
0 | 773 av_fdct = fdct_mmx; |
774 | |
775 put_pixels_tab[0] = put_pixels_mmx; | |
776 put_pixels_tab[1] = put_pixels_x2_mmx; | |
777 put_pixels_tab[2] = put_pixels_y2_mmx; | |
778 put_pixels_tab[3] = put_pixels_xy2_mmx; | |
779 | |
780 put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
781 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
782 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
783 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
415 | 784 |
0 | 785 avg_pixels_tab[0] = avg_pixels_mmx; |
786 avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
787 avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
788 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
789 | |
790 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
791 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
792 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
793 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
386 | 794 |
0 | 795 if (mm_flags & MM_MMXEXT) { |
294 | 796 pix_abs16x16 = pix_abs16x16_mmx2; |
797 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | |
798 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | |
799 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; | |
415 | 800 |
294 | 801 pix_abs8x8 = pix_abs8x8_mmx2; |
802 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | |
803 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | |
804 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | |
386 | 805 |
806 put_pixels_tab[1] = put_pixels_x2_mmx2; | |
807 put_pixels_tab[2] = put_pixels_y2_mmx2; | |
808 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2; | |
809 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2; | |
415 | 810 |
386 | 811 avg_pixels_tab[0] = avg_pixels_mmx2; |
812 avg_pixels_tab[1] = avg_pixels_x2_mmx2; | |
813 avg_pixels_tab[2] = avg_pixels_y2_mmx2; | |
814 avg_pixels_tab[3] = avg_pixels_xy2_mmx2; | |
0 | 815 } else if (mm_flags & MM_3DNOW) { |
816 put_pixels_tab[1] = put_pixels_x2_3dnow; | |
817 put_pixels_tab[2] = put_pixels_y2_3dnow; | |
386 | 818 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow; |
819 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow; | |
393 | 820 |
0 | 821 avg_pixels_tab[0] = avg_pixels_3dnow; |
822 avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
823 avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
824 avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
825 } | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
826 |
42 | 827 /* idct */ |
828 if (mm_flags & MM_MMXEXT) { | |
829 ff_idct = ff_mmxext_idct; | |
830 } else { | |
831 ff_idct = ff_mmx_idct; | |
832 } | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
833 #ifdef SIMPLE_IDCT |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
834 // ff_idct = simple_idct; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
835 ff_idct = simple_idct_mmx; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
836 #endif |
0 | 837 } |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
838 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
839 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
840 // for speed testing |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
841 get_pixels = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
842 put_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
843 add_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
844 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
845 pix_abs16x16 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
846 pix_abs16x16_x2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
847 pix_abs16x16_y2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
848 pix_abs16x16_xy2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
849 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
850 put_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
851 put_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
852 put_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
853 put_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
854 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
855 put_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
856 put_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
857 put_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
858 put_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
859 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
860 avg_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
861 avg_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
862 avg_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
863 avg_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
864 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
865 avg_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
866 avg_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
867 avg_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
868 avg_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
869 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
870 //av_fdct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
871 //ff_idct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
872 #endif |
0 | 873 } |
402 | 874 |
875 /* remove any non bit exact operation (testing purpose). NOTE that | |
876 this function should be kept as small as possible because it is | |
877 always difficult to test automatically non bit exact cases. */ | |
878 void dsputil_set_bit_exact_mmx(void) | |
879 { | |
880 if (mm_flags & MM_MMX) { | |
881 if (mm_flags & MM_MMXEXT) { | |
882 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
883 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
884 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
885 } else if (mm_flags & MM_3DNOW) { | |
886 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
887 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
888 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
889 } | |
890 } | |
891 } |