Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 444:a5edef76dac6 libavcodec
* new mmx code - based upon http://aggregate.org/MAGIC
for now it's rather sneak preview (new functions are nearly 100% faster)
author | kabi |
---|---|
date | Wed, 29 May 2002 14:29:48 +0000 |
parents | fe58fe638f9b |
children | 62c01dbdc1e0 |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
0 | 4 * |
429 | 5 * This library is free software; you can redistribute it and/or |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
0 | 9 * |
429 | 10 * This library is distributed in the hope that it will be useful, |
0 | 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 * Lesser General Public License for more details. | |
0 | 14 * |
429 | 15 * You should have received a copy of the GNU Lesser General Public |
16 * License along with this library; if not, write to the Free Software | |
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
0 | 18 * |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 #include "../dsputil.h" | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
23 #include "../simple_idct.h" |
0 | 24 |
5 | 25 int mm_flags; /* multimedia extension flags */ |
26 | |
294 | 27 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); |
28 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
29 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
30 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
31 | |
32 int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
33 int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
34 int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
35 int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
36 | |
37 int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
38 int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
39 int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
40 int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
41 | |
42 int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
43 int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
44 int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
45 int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
46 | |
42 | 47 /* external functions, from idct_mmx.c */ |
48 void ff_mmx_idct(DCTELEM *block); | |
49 void ff_mmxext_idct(DCTELEM *block); | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
50 |
0 | 51 /* pixel operations */ |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
52 static const uint64_t mm_bfe __attribute__ ((aligned(8))) = 0xfefefefefefefefeULL; |
387 | 53 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; |
54 static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; | |
55 static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; | |
8 | 56 //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; |
57 //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; | |
0 | 58 |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
59 #define JUMPALIGN() __asm __volatile (".balign 8"::) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
60 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
61 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
62 #ifndef PIC |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
63 #define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone)) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
64 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo)) |
387 | 65 #define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t" |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
66 #define MOVQ_BFE(regd) "movq "MANGLE(mm_bfe)", "#regd" \n\t" |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
67 #else |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
68 // for shared library it's better to use this way for accessing constants |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
69 // pcmpeqd -> -1 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
70 #define MOVQ_WONE(regd) \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
71 __asm __volatile ( \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
72 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
73 "psrlw $15, %%" #regd ::) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
74 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
75 #define MOVQ_WTWO(regd) \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
76 __asm __volatile ( \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
77 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
78 "psrlw $15, %%" #regd " \n\t" \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
79 "psllw $1, %%" #regd ::) |
387 | 80 |
81 #define MOVQ_BONE(regd) \ | |
82 "pcmpeqd " #regd ", " #regd " \n\t" \ | |
83 "psrlw $15, " #regd " \n\t"\ | |
84 "packuswb " #regd ", " #regd " \n\t" | |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
85 |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
86 #define MOVQ_BFE(regd) \ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
87 "pcmpeqd " #regd ", " #regd " \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
88 "paddb " #regd ", " #regd " \n\t" |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
89 #endif |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
90 |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
91 // using mm6 as temporary and for the output result |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
92 // first argument is unmodified and second is trashed |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
93 // mm7 is supposed to contain 0xfefefefefefefefe |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
94 #define PAVG_MMX_NO_RND(rega, regb) \ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
95 "movq " #rega ", %%mm6 \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
96 "pand " #regb ", %%mm6 \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
97 "pxor " #rega ", " #regb " \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
98 "pand %%mm7, " #regb " \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
99 "psrlq $1, " #regb " \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
100 "paddb " #regb ", %%mm6 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
101 |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
102 #define PAVG_MMX(rega, regb) \ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
103 "movq " #rega ", %%mm6 \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
104 "por " #regb ", %%mm6 \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
105 "pxor " #rega ", " #regb " \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
106 "pand %%mm7, " #regb " \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
107 "psrlq $1, " #regb " \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
108 "psubb " #regb ", %%mm6 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
109 |
387 | 110 |
0 | 111 /***********************************/ |
112 /* 3Dnow specific */ | |
113 | |
114 #define DEF(x) x ## _3dnow | |
115 /* for Athlons PAVGUSB is preferred */ | |
116 #define PAVGB "pavgusb" | |
117 | |
118 #include "dsputil_mmx_avg.h" | |
119 | |
120 #undef DEF | |
121 #undef PAVGB | |
122 | |
123 /***********************************/ | |
124 /* MMX2 specific */ | |
125 | |
386 | 126 #define DEF(x) x ## _mmx2 |
0 | 127 |
128 /* Introduced only in MMX2 set */ | |
129 #define PAVGB "pavgb" | |
130 | |
131 #include "dsputil_mmx_avg.h" | |
132 | |
133 #undef DEF | |
134 #undef PAVGB | |
135 | |
136 /***********************************/ | |
137 /* standard MMX */ | |
138 | |
139 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
140 { | |
386 | 141 asm volatile( |
142 "movl $-128, %%eax \n\t" | |
143 "pxor %%mm7, %%mm7 \n\t" | |
144 ".balign 16 \n\t" | |
145 "1: \n\t" | |
146 "movq (%0), %%mm0 \n\t" | |
147 "movq (%0, %2), %%mm2 \n\t" | |
148 "movq %%mm0, %%mm1 \n\t" | |
149 "movq %%mm2, %%mm3 \n\t" | |
150 "punpcklbw %%mm7, %%mm0 \n\t" | |
151 "punpckhbw %%mm7, %%mm1 \n\t" | |
152 "punpcklbw %%mm7, %%mm2 \n\t" | |
153 "punpckhbw %%mm7, %%mm3 \n\t" | |
154 "movq %%mm0, (%1, %%eax)\n\t" | |
155 "movq %%mm1, 8(%1, %%eax)\n\t" | |
156 "movq %%mm2, 16(%1, %%eax)\n\t" | |
157 "movq %%mm3, 24(%1, %%eax)\n\t" | |
158 "addl %3, %0 \n\t" | |
159 "addl $32, %%eax \n\t" | |
160 "js 1b \n\t" | |
161 : "+r" (pixels) | |
162 : "r" (block+64), "r" (line_size), "r" (line_size*2) | |
163 : "%eax" | |
164 ); | |
0 | 165 } |
166 | |
324 | 167 static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) |
168 { | |
169 asm volatile( | |
386 | 170 "pxor %%mm7, %%mm7 \n\t" |
171 "movl $-128, %%eax \n\t" | |
324 | 172 ".balign 16 \n\t" |
173 "1: \n\t" | |
174 "movq (%0), %%mm0 \n\t" | |
175 "movq (%1), %%mm2 \n\t" | |
176 "movq %%mm0, %%mm1 \n\t" | |
177 "movq %%mm2, %%mm3 \n\t" | |
178 "punpcklbw %%mm7, %%mm0 \n\t" | |
179 "punpckhbw %%mm7, %%mm1 \n\t" | |
180 "punpcklbw %%mm7, %%mm2 \n\t" | |
181 "punpckhbw %%mm7, %%mm3 \n\t" | |
182 "psubw %%mm2, %%mm0 \n\t" | |
183 "psubw %%mm3, %%mm1 \n\t" | |
184 "movq %%mm0, (%2, %%eax)\n\t" | |
185 "movq %%mm1, 8(%2, %%eax)\n\t" | |
186 "addl %3, %0 \n\t" | |
187 "addl %3, %1 \n\t" | |
188 "addl $16, %%eax \n\t" | |
189 "jnz 1b \n\t" | |
190 : "+r" (s1), "+r" (s2) | |
191 : "r" (block+64), "r" (stride) | |
192 : "%eax" | |
193 ); | |
194 } | |
195 | |
0 | 196 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) |
197 { | |
198 const DCTELEM *p; | |
199 UINT8 *pix; | |
200 | |
201 /* read the pixels */ | |
202 p = block; | |
203 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
204 /* unrolled loop */ |
0 | 205 __asm __volatile( |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
206 "movq %3, %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
207 "movq 8%3, %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
208 "movq 16%3, %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
209 "movq 24%3, %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
210 "movq 32%3, %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
211 "movq 40%3, %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
212 "movq 48%3, %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
213 "movq 56%3, %%mm7\n\t" |
0 | 214 "packuswb %%mm1, %%mm0\n\t" |
215 "packuswb %%mm3, %%mm2\n\t" | |
216 "packuswb %%mm5, %%mm4\n\t" | |
217 "packuswb %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
218 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
219 "movq %%mm2, (%0, %1)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
220 "movq %%mm4, (%0, %1, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
221 "movq %%mm6, (%0, %2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
222 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) |
0 | 223 :"memory"); |
224 pix += line_size*4; | |
225 p += 32; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
226 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
227 // if here would be an exact copy of the code above |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
228 // compiler would generate some very strange code |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
229 // thus using "r" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
230 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
231 "movq (%3), %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
232 "movq 8(%3), %%mm1\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
233 "movq 16(%3), %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
234 "movq 24(%3), %%mm3\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
235 "movq 32(%3), %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
236 "movq 40(%3), %%mm5\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
237 "movq 48(%3), %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
238 "movq 56(%3), %%mm7\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
239 "packuswb %%mm1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
240 "packuswb %%mm3, %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
241 "packuswb %%mm5, %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
242 "packuswb %%mm7, %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
243 "movq %%mm0, (%0)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
244 "movq %%mm2, (%0, %1)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
245 "movq %%mm4, (%0, %1, 2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
246 "movq %%mm6, (%0, %2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
247 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
248 :"memory"); |
0 | 249 } |
250 | |
251 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
252 { | |
253 const DCTELEM *p; | |
254 UINT8 *pix; | |
255 int i; | |
256 | |
257 /* read the pixels */ | |
258 p = block; | |
259 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
260 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
261 i = 4; |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
262 do { |
0 | 263 __asm __volatile( |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
264 "movq (%2), %%mm0\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
265 "movq 8(%2), %%mm1\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
266 "movq 16(%2), %%mm2\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
267 "movq 24(%2), %%mm3\n\t" |
0 | 268 "movq %0, %%mm4\n\t" |
269 "movq %1, %%mm6\n\t" | |
270 "movq %%mm4, %%mm5\n\t" | |
271 "punpcklbw %%mm7, %%mm4\n\t" | |
272 "punpckhbw %%mm7, %%mm5\n\t" | |
273 "paddsw %%mm4, %%mm0\n\t" | |
274 "paddsw %%mm5, %%mm1\n\t" | |
275 "movq %%mm6, %%mm5\n\t" | |
276 "punpcklbw %%mm7, %%mm6\n\t" | |
277 "punpckhbw %%mm7, %%mm5\n\t" | |
278 "paddsw %%mm6, %%mm2\n\t" | |
279 "paddsw %%mm5, %%mm3\n\t" | |
280 "packuswb %%mm1, %%mm0\n\t" | |
281 "packuswb %%mm3, %%mm2\n\t" | |
282 "movq %%mm0, %0\n\t" | |
283 "movq %%mm2, %1\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
284 :"+m"(*pix), "+m"(*(pix+line_size)) |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
285 :"r"(p) |
0 | 286 :"memory"); |
287 pix += line_size*2; | |
288 p += 16; | |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
289 } while (--i); |
0 | 290 } |
291 | |
292 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
293 { | |
420 | 294 asm volatile |
295 ( | |
296 "lea (%3, %3), %%eax \n\t" | |
422 | 297 ".balign 8 \n\t" |
420 | 298 "1: \n\t" |
299 "movq (%1), %%mm0 \n\t" | |
300 "movq (%1, %3), %%mm1 \n\t" | |
301 "movq %%mm0, (%2) \n\t" | |
302 "movq %%mm1, (%2, %3) \n\t" | |
303 "addl %%eax, %1 \n\t" | |
304 "addl %%eax, %2 \n\t" | |
305 "movq (%1), %%mm0 \n\t" | |
306 "movq (%1, %3), %%mm1 \n\t" | |
307 "movq %%mm0, (%2) \n\t" | |
308 "movq %%mm1, (%2, %3) \n\t" | |
309 "addl %%eax, %1 \n\t" | |
310 "addl %%eax, %2 \n\t" | |
311 "subl $4, %0 \n\t" | |
312 "jnz 1b \n\t" | |
313 : "+g"(h), "+r" (pixels), "+r" (block) | |
314 : "r"(line_size) | |
315 : "%eax", "memory" | |
316 ); | |
0 | 317 } |
318 | |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
319 // it will have to be checked whether it's better to have bigger |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
320 // unrolled code also on Celerons - for now yes |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
321 #define LONG_UNROLL 1 |
0 | 322 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
323 { | |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
324 #if 0 |
0 | 325 UINT8 *p; |
326 const UINT8 *pix; | |
327 p = block; | |
328 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
329 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
330 MOVQ_WONE(mm4); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
331 JUMPALIGN(); |
0 | 332 do { |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
333 __asm __volatile( |
0 | 334 "movq %1, %%mm0\n\t" |
335 "movq 1%1, %%mm1\n\t" | |
336 "movq %%mm0, %%mm2\n\t" | |
337 "movq %%mm1, %%mm3\n\t" | |
338 "punpcklbw %%mm7, %%mm0\n\t" | |
339 "punpcklbw %%mm7, %%mm1\n\t" | |
340 "punpckhbw %%mm7, %%mm2\n\t" | |
341 "punpckhbw %%mm7, %%mm3\n\t" | |
342 "paddusw %%mm1, %%mm0\n\t" | |
343 "paddusw %%mm3, %%mm2\n\t" | |
344 "paddusw %%mm4, %%mm0\n\t" | |
345 "paddusw %%mm4, %%mm2\n\t" | |
346 "psrlw $1, %%mm0\n\t" | |
347 "psrlw $1, %%mm2\n\t" | |
348 "packuswb %%mm2, %%mm0\n\t" | |
349 "movq %%mm0, %0\n\t" | |
350 :"=m"(*p) | |
351 :"m"(*pix) | |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
352 :"memory"); |
0 | 353 pix += line_size; p += line_size; |
354 } while (--h); | |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
355 #else |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
356 __asm __volatile( |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
357 MOVQ_BFE(%%mm7) |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
358 "lea (%3, %3), %%eax \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
359 ".balign 8 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
360 "1: \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
361 "movq (%1), %%mm0 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
362 "movq (%1, %3), %%mm2 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
363 "movq 1(%1), %%mm1 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
364 "movq 1(%1, %3), %%mm3 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
365 PAVG_MMX(%%mm0, %%mm1) |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
366 "movq %%mm6, (%2) \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
367 PAVG_MMX(%%mm2, %%mm3) |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
368 "movq %%mm6, (%2, %3) \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
369 "addl %%eax, %1 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
370 "addl %%eax, %2 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
371 #if LONG_UNROLL |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
372 "movq (%1), %%mm0 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
373 "movq (%1, %3), %%mm2 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
374 "movq 1(%1), %%mm1 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
375 "movq 1(%1, %3), %%mm3 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
376 PAVG_MMX(%%mm0, %%mm1) |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
377 "movq %%mm6, (%2) \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
378 PAVG_MMX(%%mm2, %%mm3) |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
379 "movq %%mm6, (%2, %3) \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
380 "addl %%eax, %1 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
381 "addl %%eax, %2 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
382 "subl $4, %0 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
383 #else |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
384 "subl $2, %0 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
385 #endif |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
386 "jnz 1b \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
387 :"+g"(h), "+S"(pixels), "+D"(block) |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
388 :"r"(line_size) |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
389 :"eax", "memory"); |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
390 #endif |
0 | 391 } |
392 | |
393 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
394 { | |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
395 #if 0 |
0 | 396 UINT8 *p; |
397 const UINT8 *pix; | |
398 p = block; | |
399 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
400 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
401 MOVQ_WONE(mm4); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
402 JUMPALIGN(); |
0 | 403 do { |
404 __asm __volatile( | |
405 "movq %1, %%mm0\n\t" | |
406 "movq %2, %%mm1\n\t" | |
407 "movq %%mm0, %%mm2\n\t" | |
408 "movq %%mm1, %%mm3\n\t" | |
409 "punpcklbw %%mm7, %%mm0\n\t" | |
410 "punpcklbw %%mm7, %%mm1\n\t" | |
411 "punpckhbw %%mm7, %%mm2\n\t" | |
412 "punpckhbw %%mm7, %%mm3\n\t" | |
413 "paddusw %%mm1, %%mm0\n\t" | |
414 "paddusw %%mm3, %%mm2\n\t" | |
415 "paddusw %%mm4, %%mm0\n\t" | |
416 "paddusw %%mm4, %%mm2\n\t" | |
417 "psrlw $1, %%mm0\n\t" | |
418 "psrlw $1, %%mm2\n\t" | |
419 "packuswb %%mm2, %%mm0\n\t" | |
420 "movq %%mm0, %0\n\t" | |
421 :"=m"(*p) | |
422 :"m"(*pix), | |
423 "m"(*(pix+line_size)) | |
424 :"memory"); | |
425 pix += line_size; | |
426 p += line_size; | |
427 } while (--h); | |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
428 #else |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
429 __asm __volatile( |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
430 MOVQ_BFE(%%mm7) |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
431 "lea (%3, %3), %%eax \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
432 "movq (%1), %%mm0 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
433 ".balign 8 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
434 "1: \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
435 "movq (%1, %3), %%mm1 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
436 "movq (%1, %%eax),%%mm2 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
437 PAVG_MMX(%%mm1, %%mm0) |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
438 "movq %%mm6, (%2) \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
439 PAVG_MMX(%%mm2, %%mm1) |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
440 "movq %%mm6, (%2, %3) \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
441 "addl %%eax, %1 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
442 "addl %%eax, %2 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
443 #ifdef LONG_UNROLL |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
444 "movq (%1, %3), %%mm1 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
445 "movq (%1, %%eax),%%mm0 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
446 PAVG_MMX(%%mm1, %%mm2) |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
447 "movq %%mm6, (%2) \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
448 PAVG_MMX(%%mm0, %%mm1) |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
449 "movq %%mm6, (%2, %3) \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
450 "addl %%eax, %1 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
451 "addl %%eax, %2 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
452 "subl $4, %0 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
453 #else |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
454 "subl $2, %0 \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
455 #endif |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
456 "jnz 1b \n\t" |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
457 :"+g"(h), "+S"(pixels), "+D"(block) |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
458 :"r"(line_size) |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
459 :"eax", "memory"); |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
460 #endif |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
461 |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
462 |
0 | 463 } |
464 | |
465 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
466 { | |
467 UINT8 *p; | |
468 const UINT8 *pix; | |
469 p = block; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
470 pix = pixels; // 1s |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
471 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
472 MOVQ_WTWO(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
473 JUMPALIGN(); |
0 | 474 do { |
475 __asm __volatile( | |
476 "movq %1, %%mm0\n\t" | |
477 "movq %2, %%mm1\n\t" | |
478 "movq 1%1, %%mm4\n\t" | |
479 "movq 1%2, %%mm5\n\t" | |
480 "movq %%mm0, %%mm2\n\t" | |
481 "movq %%mm1, %%mm3\n\t" | |
482 "punpcklbw %%mm7, %%mm0\n\t" | |
483 "punpcklbw %%mm7, %%mm1\n\t" | |
484 "punpckhbw %%mm7, %%mm2\n\t" | |
485 "punpckhbw %%mm7, %%mm3\n\t" | |
486 "paddusw %%mm1, %%mm0\n\t" | |
487 "paddusw %%mm3, %%mm2\n\t" | |
488 "movq %%mm4, %%mm1\n\t" | |
489 "movq %%mm5, %%mm3\n\t" | |
490 "punpcklbw %%mm7, %%mm4\n\t" | |
491 "punpcklbw %%mm7, %%mm5\n\t" | |
492 "punpckhbw %%mm7, %%mm1\n\t" | |
493 "punpckhbw %%mm7, %%mm3\n\t" | |
494 "paddusw %%mm5, %%mm4\n\t" | |
495 "paddusw %%mm3, %%mm1\n\t" | |
496 "paddusw %%mm6, %%mm4\n\t" | |
497 "paddusw %%mm6, %%mm1\n\t" | |
498 "paddusw %%mm4, %%mm0\n\t" | |
499 "paddusw %%mm1, %%mm2\n\t" | |
500 "psrlw $2, %%mm0\n\t" | |
501 "psrlw $2, %%mm2\n\t" | |
502 "packuswb %%mm2, %%mm0\n\t" | |
503 "movq %%mm0, %0\n\t" | |
504 :"=m"(*p) | |
505 :"m"(*pix), | |
506 "m"(*(pix+line_size)) | |
507 :"memory"); | |
508 pix += line_size; | |
509 p += line_size; | |
510 } while(--h); | |
511 } | |
512 | |
513 static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
514 { | |
515 UINT8 *p; | |
516 const UINT8 *pix; | |
517 p = block; | |
518 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
519 MOVQ_ZERO(mm7); |
0 | 520 do { |
521 __asm __volatile( | |
522 "movq %1, %%mm0\n\t" | |
523 "movq 1%1, %%mm1\n\t" | |
524 "movq %%mm0, %%mm2\n\t" | |
525 "movq %%mm1, %%mm3\n\t" | |
526 "punpcklbw %%mm7, %%mm0\n\t" | |
527 "punpcklbw %%mm7, %%mm1\n\t" | |
528 "punpckhbw %%mm7, %%mm2\n\t" | |
529 "punpckhbw %%mm7, %%mm3\n\t" | |
530 "paddusw %%mm1, %%mm0\n\t" | |
531 "paddusw %%mm3, %%mm2\n\t" | |
532 "psrlw $1, %%mm0\n\t" | |
533 "psrlw $1, %%mm2\n\t" | |
534 "packuswb %%mm2, %%mm0\n\t" | |
535 "movq %%mm0, %0\n\t" | |
536 :"=m"(*p) | |
537 :"m"(*pix) | |
538 :"memory"); | |
539 pix += line_size; | |
540 p += line_size; | |
541 } while (--h); | |
542 } | |
543 | |
544 static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
545 { | |
546 UINT8 *p; | |
547 const UINT8 *pix; | |
548 p = block; | |
549 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
550 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
551 JUMPALIGN(); |
0 | 552 do { |
553 __asm __volatile( | |
554 "movq %1, %%mm0\n\t" | |
555 "movq %2, %%mm1\n\t" | |
556 "movq %%mm0, %%mm2\n\t" | |
557 "movq %%mm1, %%mm3\n\t" | |
558 "punpcklbw %%mm7, %%mm0\n\t" | |
559 "punpcklbw %%mm7, %%mm1\n\t" | |
560 "punpckhbw %%mm7, %%mm2\n\t" | |
561 "punpckhbw %%mm7, %%mm3\n\t" | |
562 "paddusw %%mm1, %%mm0\n\t" | |
563 "paddusw %%mm3, %%mm2\n\t" | |
564 "psrlw $1, %%mm0\n\t" | |
565 "psrlw $1, %%mm2\n\t" | |
566 "packuswb %%mm2, %%mm0\n\t" | |
567 "movq %%mm0, %0\n\t" | |
568 :"=m"(*p) | |
569 :"m"(*pix), | |
570 "m"(*(pix+line_size)) | |
571 :"memory"); | |
572 pix += line_size; | |
573 p += line_size; | |
574 } while(--h); | |
575 } | |
576 | |
577 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
578 { | |
579 UINT8 *p; | |
580 const UINT8 *pix; | |
581 p = block; | |
582 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
583 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
584 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
585 JUMPALIGN(); |
0 | 586 do { |
587 __asm __volatile( | |
588 "movq %1, %%mm0\n\t" | |
589 "movq %2, %%mm1\n\t" | |
590 "movq 1%1, %%mm4\n\t" | |
591 "movq 1%2, %%mm5\n\t" | |
592 "movq %%mm0, %%mm2\n\t" | |
593 "movq %%mm1, %%mm3\n\t" | |
594 "punpcklbw %%mm7, %%mm0\n\t" | |
595 "punpcklbw %%mm7, %%mm1\n\t" | |
596 "punpckhbw %%mm7, %%mm2\n\t" | |
597 "punpckhbw %%mm7, %%mm3\n\t" | |
598 "paddusw %%mm1, %%mm0\n\t" | |
599 "paddusw %%mm3, %%mm2\n\t" | |
600 "movq %%mm4, %%mm1\n\t" | |
601 "movq %%mm5, %%mm3\n\t" | |
602 "punpcklbw %%mm7, %%mm4\n\t" | |
603 "punpcklbw %%mm7, %%mm5\n\t" | |
604 "punpckhbw %%mm7, %%mm1\n\t" | |
605 "punpckhbw %%mm7, %%mm3\n\t" | |
606 "paddusw %%mm5, %%mm4\n\t" | |
607 "paddusw %%mm3, %%mm1\n\t" | |
608 "paddusw %%mm6, %%mm4\n\t" | |
609 "paddusw %%mm6, %%mm1\n\t" | |
610 "paddusw %%mm4, %%mm0\n\t" | |
611 "paddusw %%mm1, %%mm2\n\t" | |
612 "psrlw $2, %%mm0\n\t" | |
613 "psrlw $2, %%mm2\n\t" | |
614 "packuswb %%mm2, %%mm0\n\t" | |
615 "movq %%mm0, %0\n\t" | |
616 :"=m"(*p) | |
617 :"m"(*pix), | |
618 "m"(*(pix+line_size)) | |
619 :"memory"); | |
620 pix += line_size; | |
621 p += line_size; | |
622 } while(--h); | |
623 } | |
624 | |
625 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
626 { | |
627 UINT8 *p; | |
628 const UINT8 *pix; | |
629 p = block; | |
630 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
631 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
632 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
633 JUMPALIGN(); |
0 | 634 do { |
635 __asm __volatile( | |
636 "movq %0, %%mm0\n\t" | |
637 "movq %1, %%mm1\n\t" | |
638 "movq %%mm0, %%mm2\n\t" | |
639 "movq %%mm1, %%mm3\n\t" | |
640 "punpcklbw %%mm7, %%mm0\n\t" | |
641 "punpcklbw %%mm7, %%mm1\n\t" | |
642 "punpckhbw %%mm7, %%mm2\n\t" | |
643 "punpckhbw %%mm7, %%mm3\n\t" | |
644 "paddusw %%mm1, %%mm0\n\t" | |
645 "paddusw %%mm3, %%mm2\n\t" | |
646 "paddusw %%mm6, %%mm0\n\t" | |
647 "paddusw %%mm6, %%mm2\n\t" | |
648 "psrlw $1, %%mm0\n\t" | |
649 "psrlw $1, %%mm2\n\t" | |
650 "packuswb %%mm2, %%mm0\n\t" | |
651 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
652 :"+m"(*p) |
0 | 653 :"m"(*pix) |
654 :"memory"); | |
655 pix += line_size; | |
656 p += line_size; | |
657 } | |
658 while (--h); | |
659 } | |
660 | |
661 static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
662 { | |
663 UINT8 *p; | |
664 const UINT8 *pix; | |
665 p = block; | |
666 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
667 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
668 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
669 JUMPALIGN(); |
0 | 670 do { |
671 __asm __volatile( | |
672 "movq %1, %%mm1\n\t" | |
673 "movq %0, %%mm0\n\t" | |
674 "movq 1%1, %%mm4\n\t" | |
675 "movq %%mm0, %%mm2\n\t" | |
676 "movq %%mm1, %%mm3\n\t" | |
677 "movq %%mm4, %%mm5\n\t" | |
678 "punpcklbw %%mm7, %%mm1\n\t" | |
679 "punpckhbw %%mm7, %%mm3\n\t" | |
680 "punpcklbw %%mm7, %%mm4\n\t" | |
681 "punpckhbw %%mm7, %%mm5\n\t" | |
682 "punpcklbw %%mm7, %%mm0\n\t" | |
683 "punpckhbw %%mm7, %%mm2\n\t" | |
684 "paddusw %%mm4, %%mm1\n\t" | |
685 "paddusw %%mm5, %%mm3\n\t" | |
686 "paddusw %%mm6, %%mm1\n\t" | |
687 "paddusw %%mm6, %%mm3\n\t" | |
688 "psrlw $1, %%mm1\n\t" | |
689 "psrlw $1, %%mm3\n\t" | |
690 "paddusw %%mm6, %%mm0\n\t" | |
691 "paddusw %%mm6, %%mm2\n\t" | |
692 "paddusw %%mm1, %%mm0\n\t" | |
693 "paddusw %%mm3, %%mm2\n\t" | |
694 "psrlw $1, %%mm0\n\t" | |
695 "psrlw $1, %%mm2\n\t" | |
696 "packuswb %%mm2, %%mm0\n\t" | |
697 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
698 :"+m"(*p) |
0 | 699 :"m"(*pix) |
700 :"memory"); | |
701 pix += line_size; | |
702 p += line_size; | |
703 } while (--h); | |
704 } | |
705 | |
706 static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
707 { | |
708 UINT8 *p; | |
709 const UINT8 *pix; | |
710 p = block; | |
711 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
712 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
713 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
714 JUMPALIGN(); |
0 | 715 do { |
716 __asm __volatile( | |
717 "movq %1, %%mm1\n\t" | |
718 "movq %0, %%mm0\n\t" | |
719 "movq %2, %%mm4\n\t" | |
720 "movq %%mm0, %%mm2\n\t" | |
721 "movq %%mm1, %%mm3\n\t" | |
722 "movq %%mm4, %%mm5\n\t" | |
723 "punpcklbw %%mm7, %%mm1\n\t" | |
724 "punpckhbw %%mm7, %%mm3\n\t" | |
725 "punpcklbw %%mm7, %%mm4\n\t" | |
726 "punpckhbw %%mm7, %%mm5\n\t" | |
727 "punpcklbw %%mm7, %%mm0\n\t" | |
728 "punpckhbw %%mm7, %%mm2\n\t" | |
729 "paddusw %%mm4, %%mm1\n\t" | |
730 "paddusw %%mm5, %%mm3\n\t" | |
731 "paddusw %%mm6, %%mm1\n\t" | |
732 "paddusw %%mm6, %%mm3\n\t" | |
733 "psrlw $1, %%mm1\n\t" | |
734 "psrlw $1, %%mm3\n\t" | |
735 "paddusw %%mm6, %%mm0\n\t" | |
736 "paddusw %%mm6, %%mm2\n\t" | |
737 "paddusw %%mm1, %%mm0\n\t" | |
738 "paddusw %%mm3, %%mm2\n\t" | |
739 "psrlw $1, %%mm0\n\t" | |
740 "psrlw $1, %%mm2\n\t" | |
741 "packuswb %%mm2, %%mm0\n\t" | |
742 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
743 :"+m"(*p) |
0 | 744 :"m"(*pix), "m"(*(pix+line_size)) |
745 :"memory"); | |
746 pix += line_size; | |
747 p += line_size ; | |
748 } while(--h); | |
749 } | |
750 | |
751 static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
752 { | |
753 UINT8 *p; | |
754 const UINT8 *pix; | |
755 p = block; | |
756 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
757 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
758 // this doesn't seem to be used offten - so |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
759 // the inside usage of mm_wone is not optimized |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
760 MOVQ_WTWO(mm6); |
0 | 761 do { |
762 __asm __volatile( | |
763 "movq %1, %%mm0\n\t" | |
764 "movq %2, %%mm1\n\t" | |
765 "movq 1%1, %%mm4\n\t" | |
766 "movq 1%2, %%mm5\n\t" | |
767 "movq %%mm0, %%mm2\n\t" | |
768 "movq %%mm1, %%mm3\n\t" | |
769 "punpcklbw %%mm7, %%mm0\n\t" | |
770 "punpcklbw %%mm7, %%mm1\n\t" | |
771 "punpckhbw %%mm7, %%mm2\n\t" | |
772 "punpckhbw %%mm7, %%mm3\n\t" | |
773 "paddusw %%mm1, %%mm0\n\t" | |
774 "paddusw %%mm3, %%mm2\n\t" | |
775 "movq %%mm4, %%mm1\n\t" | |
776 "movq %%mm5, %%mm3\n\t" | |
777 "punpcklbw %%mm7, %%mm4\n\t" | |
778 "punpcklbw %%mm7, %%mm5\n\t" | |
779 "punpckhbw %%mm7, %%mm1\n\t" | |
780 "punpckhbw %%mm7, %%mm3\n\t" | |
781 "paddusw %%mm5, %%mm4\n\t" | |
782 "paddusw %%mm3, %%mm1\n\t" | |
783 "paddusw %%mm6, %%mm4\n\t" | |
784 "paddusw %%mm6, %%mm1\n\t" | |
785 "paddusw %%mm4, %%mm0\n\t" | |
786 "paddusw %%mm1, %%mm2\n\t" | |
787 "movq %3, %%mm5\n\t" | |
788 "psrlw $2, %%mm0\n\t" | |
789 "movq %0, %%mm1\n\t" | |
790 "psrlw $2, %%mm2\n\t" | |
791 "movq %%mm1, %%mm3\n\t" | |
792 "punpcklbw %%mm7, %%mm1\n\t" | |
793 "punpckhbw %%mm7, %%mm3\n\t" | |
794 "paddusw %%mm1, %%mm0\n\t" | |
795 "paddusw %%mm3, %%mm2\n\t" | |
796 "paddusw %%mm5, %%mm0\n\t" | |
797 "paddusw %%mm5, %%mm2\n\t" | |
798 "psrlw $1, %%mm0\n\t" | |
799 "psrlw $1, %%mm2\n\t" | |
800 "packuswb %%mm2, %%mm0\n\t" | |
801 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
802 :"+m"(*p) |
0 | 803 :"m"(*pix), |
8 | 804 "m"(*(pix+line_size)), "m"(mm_wone) |
0 | 805 :"memory"); |
806 pix += line_size; | |
807 p += line_size ; | |
808 } while(--h); | |
809 } | |
810 | |
811 static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
812 { | |
813 UINT8 *p; | |
814 const UINT8 *pix; | |
815 p = block; | |
816 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
817 MOVQ_ZERO(mm7); |
0 | 818 do { |
819 __asm __volatile( | |
820 "movq %1, %%mm0\n\t" | |
821 "movq %0, %%mm1\n\t" | |
822 "movq %%mm0, %%mm2\n\t" | |
823 "movq %%mm1, %%mm3\n\t" | |
824 "punpcklbw %%mm7, %%mm0\n\t" | |
825 "punpcklbw %%mm7, %%mm1\n\t" | |
826 "punpckhbw %%mm7, %%mm2\n\t" | |
827 "punpckhbw %%mm7, %%mm3\n\t" | |
828 "paddusw %%mm1, %%mm0\n\t" | |
829 "paddusw %%mm3, %%mm2\n\t" | |
830 "psrlw $1, %%mm0\n\t" | |
831 "psrlw $1, %%mm2\n\t" | |
832 "packuswb %%mm2, %%mm0\n\t" | |
833 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
834 :"+m"(*p) |
0 | 835 :"m"(*pix) |
836 :"memory"); | |
837 pix += line_size; | |
838 p += line_size ; | |
839 } while (--h); | |
840 } | |
841 | |
842 static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
843 { | |
844 UINT8 *p; | |
845 const UINT8 *pix; | |
846 p = block; | |
847 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
848 MOVQ_ZERO(mm7); |
0 | 849 do { |
850 __asm __volatile( | |
851 "movq %1, %%mm0\n\t" | |
852 "movq 1%1, %%mm1\n\t" | |
853 "movq %0, %%mm4\n\t" | |
854 "movq %%mm0, %%mm2\n\t" | |
855 "movq %%mm1, %%mm3\n\t" | |
856 "movq %%mm4, %%mm5\n\t" | |
857 "punpcklbw %%mm7, %%mm0\n\t" | |
858 "punpcklbw %%mm7, %%mm1\n\t" | |
859 "punpckhbw %%mm7, %%mm2\n\t" | |
860 "punpckhbw %%mm7, %%mm3\n\t" | |
861 "punpcklbw %%mm7, %%mm4\n\t" | |
862 "punpckhbw %%mm7, %%mm5\n\t" | |
863 "paddusw %%mm1, %%mm0\n\t" | |
864 "paddusw %%mm3, %%mm2\n\t" | |
865 "psrlw $1, %%mm0\n\t" | |
866 "psrlw $1, %%mm2\n\t" | |
867 "paddusw %%mm4, %%mm0\n\t" | |
868 "paddusw %%mm5, %%mm2\n\t" | |
869 "psrlw $1, %%mm0\n\t" | |
870 "psrlw $1, %%mm2\n\t" | |
871 "packuswb %%mm2, %%mm0\n\t" | |
872 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
873 :"+m"(*p) |
0 | 874 :"m"(*pix) |
875 :"memory"); | |
876 pix += line_size; | |
877 p += line_size; | |
878 } while (--h); | |
879 } | |
880 | |
881 static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
882 { | |
883 UINT8 *p; | |
884 const UINT8 *pix; | |
885 p = block; | |
886 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
887 MOVQ_ZERO(mm7); |
0 | 888 do { |
889 __asm __volatile( | |
890 "movq %1, %%mm0\n\t" | |
891 "movq %2, %%mm1\n\t" | |
892 "movq %0, %%mm4\n\t" | |
893 "movq %%mm0, %%mm2\n\t" | |
894 "movq %%mm1, %%mm3\n\t" | |
895 "movq %%mm4, %%mm5\n\t" | |
896 "punpcklbw %%mm7, %%mm0\n\t" | |
897 "punpcklbw %%mm7, %%mm1\n\t" | |
898 "punpckhbw %%mm7, %%mm2\n\t" | |
899 "punpckhbw %%mm7, %%mm3\n\t" | |
900 "punpcklbw %%mm7, %%mm4\n\t" | |
901 "punpckhbw %%mm7, %%mm5\n\t" | |
902 "paddusw %%mm1, %%mm0\n\t" | |
903 "paddusw %%mm3, %%mm2\n\t" | |
904 "psrlw $1, %%mm0\n\t" | |
905 "psrlw $1, %%mm2\n\t" | |
906 "paddusw %%mm4, %%mm0\n\t" | |
907 "paddusw %%mm5, %%mm2\n\t" | |
908 "psrlw $1, %%mm0\n\t" | |
909 "psrlw $1, %%mm2\n\t" | |
910 "packuswb %%mm2, %%mm0\n\t" | |
911 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
912 :"+m"(*p) |
0 | 913 :"m"(*pix), "m"(*(pix+line_size)) |
914 :"memory"); | |
915 pix += line_size; | |
916 p += line_size ; | |
917 } while(--h); | |
918 } | |
919 | |
920 static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
921 { | |
922 UINT8 *p; | |
923 const UINT8 *pix; | |
924 p = block; | |
925 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
926 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
927 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
928 JUMPALIGN(); |
0 | 929 do { |
930 __asm __volatile( | |
931 "movq %1, %%mm0\n\t" | |
932 "movq %2, %%mm1\n\t" | |
933 "movq 1%1, %%mm4\n\t" | |
934 "movq 1%2, %%mm5\n\t" | |
935 "movq %%mm0, %%mm2\n\t" | |
936 "movq %%mm1, %%mm3\n\t" | |
937 "punpcklbw %%mm7, %%mm0\n\t" | |
938 "punpcklbw %%mm7, %%mm1\n\t" | |
939 "punpckhbw %%mm7, %%mm2\n\t" | |
940 "punpckhbw %%mm7, %%mm3\n\t" | |
941 "paddusw %%mm1, %%mm0\n\t" | |
942 "paddusw %%mm3, %%mm2\n\t" | |
943 "movq %%mm4, %%mm1\n\t" | |
944 "movq %%mm5, %%mm3\n\t" | |
945 "punpcklbw %%mm7, %%mm4\n\t" | |
946 "punpcklbw %%mm7, %%mm5\n\t" | |
947 "punpckhbw %%mm7, %%mm1\n\t" | |
948 "punpckhbw %%mm7, %%mm3\n\t" | |
949 "paddusw %%mm5, %%mm4\n\t" | |
950 "paddusw %%mm3, %%mm1\n\t" | |
951 "paddusw %%mm6, %%mm4\n\t" | |
952 "paddusw %%mm6, %%mm1\n\t" | |
953 "paddusw %%mm4, %%mm0\n\t" | |
954 "paddusw %%mm1, %%mm2\n\t" | |
955 "movq %0, %%mm1\n\t" | |
956 "psrlw $2, %%mm0\n\t" | |
957 "movq %%mm1, %%mm3\n\t" | |
958 "psrlw $2, %%mm2\n\t" | |
959 "punpcklbw %%mm7, %%mm1\n\t" | |
960 "punpckhbw %%mm7, %%mm3\n\t" | |
961 "paddusw %%mm1, %%mm0\n\t" | |
962 "paddusw %%mm3, %%mm2\n\t" | |
963 "psrlw $1, %%mm0\n\t" | |
964 "psrlw $1, %%mm2\n\t" | |
965 "packuswb %%mm2, %%mm0\n\t" | |
966 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
967 :"+m"(*p) |
0 | 968 :"m"(*pix), |
969 "m"(*(pix+line_size)) | |
970 :"memory"); | |
971 pix += line_size; | |
972 p += line_size; | |
973 } while(--h); | |
974 } | |
975 | |
296 | 976 static void clear_blocks_mmx(DCTELEM *blocks) |
977 { | |
978 asm volatile( | |
979 "pxor %%mm7, %%mm7 \n\t" | |
980 "movl $-128*6, %%eax \n\t" | |
981 "1: \n\t" | |
982 "movq %%mm7, (%0, %%eax) \n\t" | |
983 "movq %%mm7, 8(%0, %%eax) \n\t" | |
984 "movq %%mm7, 16(%0, %%eax) \n\t" | |
985 "movq %%mm7, 24(%0, %%eax) \n\t" | |
986 "addl $32, %%eax \n\t" | |
987 " js 1b \n\t" | |
988 : : "r" (((int)blocks)+128*6) | |
989 : "%eax" | |
990 ); | |
991 } | |
992 | |
#if 0
/* Do-nothing stand-in; only referenced by the disabled speed-testing
   section of dsputil_init_mmx. */
static void just_return()
{
}
#endif
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
996 |
0 | 997 void dsputil_init_mmx(void) |
998 { | |
999 mm_flags = mm_support(); | |
188 | 1000 #if 1 |
1001 printf("libavcodec: CPU flags:"); | |
0 | 1002 if (mm_flags & MM_MMX) |
1003 printf(" mmx"); | |
1004 if (mm_flags & MM_MMXEXT) | |
1005 printf(" mmxext"); | |
1006 if (mm_flags & MM_3DNOW) | |
1007 printf(" 3dnow"); | |
1008 if (mm_flags & MM_SSE) | |
1009 printf(" sse"); | |
1010 if (mm_flags & MM_SSE2) | |
1011 printf(" sse2"); | |
1012 printf("\n"); | |
1013 #endif | |
1014 | |
1015 if (mm_flags & MM_MMX) { | |
1016 get_pixels = get_pixels_mmx; | |
324 | 1017 diff_pixels = diff_pixels_mmx; |
0 | 1018 put_pixels_clamped = put_pixels_clamped_mmx; |
1019 add_pixels_clamped = add_pixels_clamped_mmx; | |
296 | 1020 clear_blocks= clear_blocks_mmx; |
415 | 1021 |
294 | 1022 pix_abs16x16 = pix_abs16x16_mmx; |
1023 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
1024 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
0 | 1025 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; |
294 | 1026 pix_abs8x8 = pix_abs8x8_mmx; |
1027 pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
1028 pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
1029 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
0 | 1030 av_fdct = fdct_mmx; |
1031 | |
1032 put_pixels_tab[0] = put_pixels_mmx; | |
1033 put_pixels_tab[1] = put_pixels_x2_mmx; | |
1034 put_pixels_tab[2] = put_pixels_y2_mmx; | |
1035 put_pixels_tab[3] = put_pixels_xy2_mmx; | |
1036 | |
1037 put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
1038 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1039 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1040 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
415 | 1041 |
0 | 1042 avg_pixels_tab[0] = avg_pixels_mmx; |
1043 avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
1044 avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
1045 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1046 | |
1047 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
1048 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
1049 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
1050 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
386 | 1051 |
0 | 1052 if (mm_flags & MM_MMXEXT) { |
294 | 1053 pix_abs16x16 = pix_abs16x16_mmx2; |
1054 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | |
1055 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | |
1056 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; | |
415 | 1057 |
294 | 1058 pix_abs8x8 = pix_abs8x8_mmx2; |
1059 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | |
1060 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | |
1061 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | |
386 | 1062 |
1063 put_pixels_tab[1] = put_pixels_x2_mmx2; | |
1064 put_pixels_tab[2] = put_pixels_y2_mmx2; | |
1065 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2; | |
1066 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2; | |
415 | 1067 |
386 | 1068 avg_pixels_tab[0] = avg_pixels_mmx2; |
1069 avg_pixels_tab[1] = avg_pixels_x2_mmx2; | |
1070 avg_pixels_tab[2] = avg_pixels_y2_mmx2; | |
1071 avg_pixels_tab[3] = avg_pixels_xy2_mmx2; | |
0 | 1072 } else if (mm_flags & MM_3DNOW) { |
1073 put_pixels_tab[1] = put_pixels_x2_3dnow; | |
1074 put_pixels_tab[2] = put_pixels_y2_3dnow; | |
386 | 1075 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow; |
1076 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow; | |
393 | 1077 |
0 | 1078 avg_pixels_tab[0] = avg_pixels_3dnow; |
1079 avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
1080 avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
1081 avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
1082 } | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
1083 |
42 | 1084 /* idct */ |
1085 if (mm_flags & MM_MMXEXT) { | |
1086 ff_idct = ff_mmxext_idct; | |
1087 } else { | |
1088 ff_idct = ff_mmx_idct; | |
1089 } | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1090 #ifdef SIMPLE_IDCT |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1091 // ff_idct = simple_idct; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1092 ff_idct = simple_idct_mmx; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
1093 #endif |
0 | 1094 } |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1095 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1096 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1097 // for speed testing |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1098 get_pixels = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1099 put_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1100 add_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1101 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1102 pix_abs16x16 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1103 pix_abs16x16_x2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1104 pix_abs16x16_y2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1105 pix_abs16x16_xy2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1106 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1107 put_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1108 put_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1109 put_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1110 put_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1111 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1112 put_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1113 put_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1114 put_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1115 put_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1116 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1117 avg_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1118 avg_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1119 avg_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1120 avg_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1121 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1122 avg_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1123 avg_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1124 avg_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1125 avg_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1126 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1127 //av_fdct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1128 //ff_idct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1129 #endif |
0 | 1130 } |
402 | 1131 |
1132 /* remove any non bit exact operation (testing purpose). NOTE that | |
1133 this function should be kept as small as possible because it is | |
1134 always difficult to test automatically non bit exact cases. */ | |
1135 void dsputil_set_bit_exact_mmx(void) | |
1136 { | |
1137 if (mm_flags & MM_MMX) { | |
1138 if (mm_flags & MM_MMXEXT) { | |
1139 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1140 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1141 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1142 } else if (mm_flags & MM_3DNOW) { | |
1143 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
1144 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
1145 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
1146 } | |
1147 } | |
1148 } |