Mercurial > libavcodec.hg
annotate i386/dsputil_mmx_avg.h @ 1095:c7604e6291c5 libavcodec
extended option for h263+ patch by (fixounet at free dot fr) with some minor modifications
author | michaelni |
---|---|
date | Wed, 05 Mar 2003 16:10:13 +0000 |
parents | b32afefe7d33 |
children | 07a484280a82 |
rev | line source |
---|---|
0 | 1 /* |
2 * DSP utils : average functions are compiled twice for 3dnow/mmx2 | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
386 | 4 * Copyright (c) 2002 Michael Niedermayer |
0 | 5 * |
429 | 6 * This library is free software; you can redistribute it and/or |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2 of the License, or (at your option) any later version. | |
0 | 10 * |
429 | 11 * This library is distributed in the hope that it will be useful, |
0 | 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 * Lesser General Public License for more details. | |
0 | 15 * |
429 | 16 * You should have received a copy of the GNU Lesser General Public |
17 * License along with this library; if not, write to the Free Software | |
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
0 | 19 * |
20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
386 | 21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> |
441 | 22 * and improved by Zdenek Kabelac <kabi@users.sf.net> |
0 | 23 */ |
387 | 24 |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
/* XXX: explicit register constraints are used to avoid a gcc 2.95.2 asm
   register clobber bug - the code now works with 2.95.2 and also with -fPIC
 */
1064 | 28 static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 29 { |
386 | 30 __asm __volatile( |
441 | 31 "lea (%3, %3), %%eax \n\t" |
32 "1: \n\t" | |
33 "movq (%1), %%mm0 \n\t" | |
34 "movq (%1, %3), %%mm1 \n\t" | |
35 PAVGB" 1(%1), %%mm0 \n\t" | |
36 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
37 "movq %%mm0, (%2) \n\t" | |
38 "movq %%mm1, (%2, %3) \n\t" | |
39 "addl %%eax, %1 \n\t" | |
40 "addl %%eax, %2 \n\t" | |
41 "movq (%1), %%mm0 \n\t" | |
42 "movq (%1, %3), %%mm1 \n\t" | |
43 PAVGB" 1(%1), %%mm0 \n\t" | |
44 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
45 "addl %%eax, %1 \n\t" | |
46 "movq %%mm0, (%2) \n\t" | |
47 "movq %%mm1, (%2, %3) \n\t" | |
48 "addl %%eax, %2 \n\t" | |
49 "subl $4, %0 \n\t" | |
50 "jnz 1b \n\t" | |
51 :"+g"(h), "+S"(pixels), "+D"(block) | |
52 :"r" (line_size) | |
53 :"%eax", "memory"); | |
54 } | |
651 | 55 |
954 | 56 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
57 { | |
58 __asm __volatile( | |
59 "1: \n\t" | |
60 "movq (%1), %%mm0 \n\t" | |
61 "addl %4, %1 \n\t" | |
62 "movq (%1), %%mm1 \n\t" | |
63 "addl %4, %1 \n\t" | |
64 PAVGB" (%2), %%mm0 \n\t" | |
65 PAVGB" 8(%2), %%mm1 \n\t" | |
66 "movq %%mm0, (%3) \n\t" | |
67 "addl %5, %3 \n\t" | |
68 "movq %%mm1, (%3) \n\t" | |
69 "addl %5, %3 \n\t" | |
70 "movq (%1), %%mm0 \n\t" | |
71 "addl %4, %1 \n\t" | |
72 "movq (%1), %%mm1 \n\t" | |
73 "addl %4, %1 \n\t" | |
74 PAVGB" 16(%2), %%mm0 \n\t" | |
75 PAVGB" 24(%2), %%mm1 \n\t" | |
76 "movq %%mm0, (%3) \n\t" | |
77 "addl %5, %3 \n\t" | |
78 "movq %%mm1, (%3) \n\t" | |
79 "addl %5, %3 \n\t" | |
80 "addl $32, %2 \n\t" | |
81 "subl $4, %0 \n\t" | |
82 "jnz 1b \n\t" | |
83 :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
84 :"r"(src1Stride), "r"(dstStride) | |
85 :"memory"); | |
86 } | |
87 | |
1064 | 88 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
651 | 89 { |
90 __asm __volatile( | |
91 "lea (%3, %3), %%eax \n\t" | |
92 "1: \n\t" | |
93 "movq (%1), %%mm0 \n\t" | |
94 "movq (%1, %3), %%mm1 \n\t" | |
95 "movq 8(%1), %%mm2 \n\t" | |
96 "movq 8(%1, %3), %%mm3 \n\t" | |
97 PAVGB" 1(%1), %%mm0 \n\t" | |
98 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
99 PAVGB" 9(%1), %%mm2 \n\t" | |
100 PAVGB" 9(%1, %3), %%mm3 \n\t" | |
101 "movq %%mm0, (%2) \n\t" | |
102 "movq %%mm1, (%2, %3) \n\t" | |
103 "movq %%mm2, 8(%2) \n\t" | |
104 "movq %%mm3, 8(%2, %3) \n\t" | |
105 "addl %%eax, %1 \n\t" | |
106 "addl %%eax, %2 \n\t" | |
107 "movq (%1), %%mm0 \n\t" | |
108 "movq (%1, %3), %%mm1 \n\t" | |
109 "movq 8(%1), %%mm2 \n\t" | |
110 "movq 8(%1, %3), %%mm3 \n\t" | |
111 PAVGB" 1(%1), %%mm0 \n\t" | |
112 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
113 PAVGB" 9(%1), %%mm2 \n\t" | |
114 PAVGB" 9(%1, %3), %%mm3 \n\t" | |
115 "addl %%eax, %1 \n\t" | |
116 "movq %%mm0, (%2) \n\t" | |
117 "movq %%mm1, (%2, %3) \n\t" | |
118 "movq %%mm2, 8(%2) \n\t" | |
119 "movq %%mm3, 8(%2, %3) \n\t" | |
120 "addl %%eax, %2 \n\t" | |
121 "subl $4, %0 \n\t" | |
122 "jnz 1b \n\t" | |
123 :"+g"(h), "+S"(pixels), "+D"(block) | |
124 :"r" (line_size) | |
125 :"%eax", "memory"); | |
126 } | |
954 | 127 |
128 static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
129 { | |
130 __asm __volatile( | |
131 "1: \n\t" | |
132 "movq (%1), %%mm0 \n\t" | |
133 "movq 8(%1), %%mm1 \n\t" | |
134 "addl %4, %1 \n\t" | |
135 PAVGB" (%2), %%mm0 \n\t" | |
136 PAVGB" 8(%2), %%mm1 \n\t" | |
137 "movq %%mm0, (%3) \n\t" | |
138 "movq %%mm1, 8(%3) \n\t" | |
139 "addl %5, %3 \n\t" | |
140 "movq (%1), %%mm0 \n\t" | |
141 "movq 8(%1), %%mm1 \n\t" | |
142 "addl %4, %1 \n\t" | |
143 PAVGB" 16(%2), %%mm0 \n\t" | |
144 PAVGB" 24(%2), %%mm1 \n\t" | |
145 "movq %%mm0, (%3) \n\t" | |
146 "movq %%mm1, 8(%3) \n\t" | |
147 "addl %5, %3 \n\t" | |
148 "addl $32, %2 \n\t" | |
149 "subl $2, %0 \n\t" | |
150 "jnz 1b \n\t" | |
151 :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
152 :"r"(src1Stride), "r"(dstStride) | |
153 :"memory"); | |
154 } | |
441 | 155 |
156 /* GL: this function does incorrect rounding if overflow */ | |
1064 | 157 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
441 | 158 { |
448 | 159 MOVQ_BONE(mm6); |
441 | 160 __asm __volatile( |
161 "lea (%3, %3), %%eax \n\t" | |
439 | 162 "1: \n\t" |
163 "movq (%1), %%mm0 \n\t" | |
164 "movq (%1, %3), %%mm2 \n\t" | |
165 "movq 1(%1), %%mm1 \n\t" | |
166 "movq 1(%1, %3), %%mm3 \n\t" | |
441 | 167 "addl %%eax, %1 \n\t" |
448 | 168 "psubusb %%mm6, %%mm0 \n\t" |
169 "psubusb %%mm6, %%mm2 \n\t" | |
386 | 170 PAVGB" %%mm1, %%mm0 \n\t" |
171 PAVGB" %%mm3, %%mm2 \n\t" | |
439 | 172 "movq %%mm0, (%2) \n\t" |
173 "movq %%mm2, (%2, %3) \n\t" | |
174 "movq (%1), %%mm0 \n\t" | |
175 "movq 1(%1), %%mm1 \n\t" | |
176 "movq (%1, %3), %%mm2 \n\t" | |
177 "movq 1(%1, %3), %%mm3 \n\t" | |
441 | 178 "addl %%eax, %2 \n\t" |
439 | 179 "addl %%eax, %1 \n\t" |
448 | 180 "psubusb %%mm6, %%mm0 \n\t" |
181 "psubusb %%mm6, %%mm2 \n\t" | |
386 | 182 PAVGB" %%mm1, %%mm0 \n\t" |
183 PAVGB" %%mm3, %%mm2 \n\t" | |
439 | 184 "movq %%mm0, (%2) \n\t" |
185 "movq %%mm2, (%2, %3) \n\t" | |
441 | 186 "addl %%eax, %2 \n\t" |
187 "subl $4, %0 \n\t" | |
188 "jnz 1b \n\t" | |
189 :"+g"(h), "+S"(pixels), "+D"(block) | |
190 :"r" (line_size) | |
386 | 191 :"%eax", "memory"); |
0 | 192 } |
193 | |
1064 | 194 static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 195 { |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
196 __asm __volatile( |
441 | 197 "lea (%3, %3), %%eax \n\t" |
198 "movq (%1), %%mm0 \n\t" | |
199 "subl %3, %2 \n\t" | |
200 "1: \n\t" | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
201 "movq (%1, %3), %%mm1 \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
202 "movq (%1, %%eax), %%mm2 \n\t" |
441 | 203 "addl %%eax, %1 \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
204 PAVGB" %%mm1, %%mm0 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
205 PAVGB" %%mm2, %%mm1 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
206 "movq %%mm0, (%2, %3) \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
207 "movq %%mm1, (%2, %%eax) \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
208 "movq (%1, %3), %%mm1 \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
209 "movq (%1, %%eax), %%mm0 \n\t" |
441 | 210 "addl %%eax, %2 \n\t" |
211 "addl %%eax, %1 \n\t" | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
212 PAVGB" %%mm1, %%mm2 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
213 PAVGB" %%mm0, %%mm1 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
214 "movq %%mm2, (%2, %3) \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
215 "movq %%mm1, (%2, %%eax) \n\t" |
441 | 216 "addl %%eax, %2 \n\t" |
217 "subl $4, %0 \n\t" | |
218 "jnz 1b \n\t" | |
219 :"+g"(h), "+S"(pixels), "+D" (block) | |
220 :"r" (line_size) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
221 :"%eax", "memory"); |
386 | 222 } |
223 | |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
224 /* GL: this function does incorrect rounding if overflow */ |
1064 | 225 static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
386 | 226 { |
448 | 227 MOVQ_BONE(mm6); |
0 | 228 __asm __volatile( |
441 | 229 "lea (%3, %3), %%eax \n\t" |
230 "movq (%1), %%mm0 \n\t" | |
231 "subl %3, %2 \n\t" | |
232 "1: \n\t" | |
439 | 233 "movq (%1, %3), %%mm1 \n\t" |
234 "movq (%1, %%eax), %%mm2 \n\t" | |
441 | 235 "addl %%eax, %1 \n\t" |
448 | 236 "psubusb %%mm6, %%mm1 \n\t" |
386 | 237 PAVGB" %%mm1, %%mm0 \n\t" |
238 PAVGB" %%mm2, %%mm1 \n\t" | |
439 | 239 "movq %%mm0, (%2, %3) \n\t" |
240 "movq %%mm1, (%2, %%eax) \n\t" | |
241 "movq (%1, %3), %%mm1 \n\t" | |
242 "movq (%1, %%eax), %%mm0 \n\t" | |
441 | 243 "addl %%eax, %2 \n\t" |
244 "addl %%eax, %1 \n\t" | |
448 | 245 "psubusb %%mm6, %%mm1 \n\t" |
386 | 246 PAVGB" %%mm1, %%mm2 \n\t" |
247 PAVGB" %%mm0, %%mm1 \n\t" | |
439 | 248 "movq %%mm2, (%2, %3) \n\t" |
249 "movq %%mm1, (%2, %%eax) \n\t" | |
441 | 250 "addl %%eax, %2 \n\t" |
251 "subl $4, %0 \n\t" | |
252 "jnz 1b \n\t" | |
253 :"+g"(h), "+S"(pixels), "+D" (block) | |
254 :"r" (line_size) | |
439 | 255 :"%eax", "memory"); |
0 | 256 } |
257 | |
1064 | 258 static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 259 { |
260 __asm __volatile( | |
441 | 261 "lea (%3, %3), %%eax \n\t" |
262 "1: \n\t" | |
263 "movq (%2), %%mm0 \n\t" | |
264 "movq (%2, %3), %%mm1 \n\t" | |
265 PAVGB" (%1), %%mm0 \n\t" | |
266 PAVGB" (%1, %3), %%mm1 \n\t" | |
267 "movq %%mm0, (%2) \n\t" | |
268 "movq %%mm1, (%2, %3) \n\t" | |
269 "addl %%eax, %1 \n\t" | |
270 "addl %%eax, %2 \n\t" | |
271 "movq (%2), %%mm0 \n\t" | |
272 "movq (%2, %3), %%mm1 \n\t" | |
273 PAVGB" (%1), %%mm0 \n\t" | |
274 PAVGB" (%1, %3), %%mm1 \n\t" | |
275 "addl %%eax, %1 \n\t" | |
276 "movq %%mm0, (%2) \n\t" | |
277 "movq %%mm1, (%2, %3) \n\t" | |
278 "addl %%eax, %2 \n\t" | |
279 "subl $4, %0 \n\t" | |
280 "jnz 1b \n\t" | |
281 :"+g"(h), "+S"(pixels), "+D"(block) | |
282 :"r" (line_size) | |
386 | 283 :"%eax", "memory"); |
0 | 284 } |
285 | |
1064 | 286 static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 287 { |
288 __asm __volatile( | |
441 | 289 "lea (%3, %3), %%eax \n\t" |
290 "1: \n\t" | |
291 "movq (%1), %%mm0 \n\t" | |
292 "movq (%1, %3), %%mm2 \n\t" | |
293 PAVGB" 1(%1), %%mm0 \n\t" | |
294 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
295 PAVGB" (%2), %%mm0 \n\t" | |
296 PAVGB" (%2, %3), %%mm2 \n\t" | |
297 "addl %%eax, %1 \n\t" | |
298 "movq %%mm0, (%2) \n\t" | |
299 "movq %%mm2, (%2, %3) \n\t" | |
300 "movq (%1), %%mm0 \n\t" | |
301 "movq (%1, %3), %%mm2 \n\t" | |
302 PAVGB" 1(%1), %%mm0 \n\t" | |
303 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
304 "addl %%eax, %2 \n\t" | |
305 "addl %%eax, %1 \n\t" | |
306 PAVGB" (%2), %%mm0 \n\t" | |
307 PAVGB" (%2, %3), %%mm2 \n\t" | |
308 "movq %%mm0, (%2) \n\t" | |
309 "movq %%mm2, (%2, %3) \n\t" | |
310 "addl %%eax, %2 \n\t" | |
311 "subl $4, %0 \n\t" | |
312 "jnz 1b \n\t" | |
313 :"+g"(h), "+S"(pixels), "+D"(block) | |
314 :"r" (line_size) | |
386 | 315 :"%eax", "memory"); |
0 | 316 } |
317 | |
1064 | 318 static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 319 { |
320 __asm __volatile( | |
441 | 321 "lea (%3, %3), %%eax \n\t" |
386 | 322 "movq (%1), %%mm0 \n\t" |
441 | 323 "subl %3, %2 \n\t" |
324 "1: \n\t" | |
325 "movq (%1, %3), %%mm1 \n\t" | |
326 "movq (%1, %%eax), %%mm2 \n\t" | |
327 "addl %%eax, %1 \n\t" | |
386 | 328 PAVGB" %%mm1, %%mm0 \n\t" |
329 PAVGB" %%mm2, %%mm1 \n\t" | |
441 | 330 "movq (%2, %3), %%mm3 \n\t" |
331 "movq (%2, %%eax), %%mm4 \n\t" | |
386 | 332 PAVGB" %%mm3, %%mm0 \n\t" |
333 PAVGB" %%mm4, %%mm1 \n\t" | |
441 | 334 "movq %%mm0, (%2, %3) \n\t" |
335 "movq %%mm1, (%2, %%eax) \n\t" | |
336 "movq (%1, %3), %%mm1 \n\t" | |
337 "movq (%1, %%eax), %%mm0 \n\t" | |
386 | 338 PAVGB" %%mm1, %%mm2 \n\t" |
339 PAVGB" %%mm0, %%mm1 \n\t" | |
441 | 340 "addl %%eax, %2 \n\t" |
341 "addl %%eax, %1 \n\t" | |
342 "movq (%2, %3), %%mm3 \n\t" | |
343 "movq (%2, %%eax), %%mm4 \n\t" | |
386 | 344 PAVGB" %%mm3, %%mm2 \n\t" |
345 PAVGB" %%mm4, %%mm1 \n\t" | |
441 | 346 "movq %%mm2, (%2, %3) \n\t" |
347 "movq %%mm1, (%2, %%eax) \n\t" | |
348 "addl %%eax, %2 \n\t" | |
349 "subl $4, %0 \n\t" | |
350 "jnz 1b \n\t" | |
351 :"+g"(h), "+S"(pixels), "+D"(block) | |
352 :"r" (line_size) | |
353 :"%eax", "memory"); | |
0 | 354 } |
355 | |
386 | 356 // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter |
1064 | 357 static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 358 { |
448 | 359 MOVQ_BONE(mm6); |
0 | 360 __asm __volatile( |
442 | 361 "lea (%3, %3), %%eax \n\t" |
386 | 362 "movq (%1), %%mm0 \n\t" |
442 | 363 PAVGB" 1(%1), %%mm0 \n\t" |
364 ".balign 8 \n\t" | |
441 | 365 "1: \n\t" |
442 | 366 "movq (%1, %%eax), %%mm2 \n\t" |
367 "movq (%1, %3), %%mm1 \n\t" | |
448 | 368 "psubusb %%mm6, %%mm2 \n\t" |
442 | 369 PAVGB" 1(%1, %3), %%mm1 \n\t" |
370 PAVGB" 1(%1, %%eax), %%mm2 \n\t" | |
371 "addl %%eax, %1 \n\t" | |
386 | 372 PAVGB" %%mm1, %%mm0 \n\t" |
373 PAVGB" %%mm2, %%mm1 \n\t" | |
442 | 374 PAVGB" (%2), %%mm0 \n\t" |
375 PAVGB" (%2, %3), %%mm1 \n\t" | |
376 "movq %%mm0, (%2) \n\t" | |
377 "movq %%mm1, (%2, %3) \n\t" | |
378 "movq (%1, %3), %%mm1 \n\t" | |
379 "movq (%1, %%eax), %%mm0 \n\t" | |
380 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
381 PAVGB" 1(%1, %%eax), %%mm0 \n\t" | |
382 "addl %%eax, %2 \n\t" | |
383 "addl %%eax, %1 \n\t" | |
386 | 384 PAVGB" %%mm1, %%mm2 \n\t" |
385 PAVGB" %%mm0, %%mm1 \n\t" | |
442 | 386 PAVGB" (%2), %%mm2 \n\t" |
387 PAVGB" (%2, %3), %%mm1 \n\t" | |
388 "movq %%mm2, (%2) \n\t" | |
389 "movq %%mm1, (%2, %3) \n\t" | |
390 "addl %%eax, %2 \n\t" | |
441 | 391 "subl $4, %0 \n\t" |
442 | 392 "jnz 1b \n\t" |
443 | 393 :"+g"(h), "+S"(pixels), "+D"(block) |
442 | 394 :"r" (line_size) |
386 | 395 :"%eax", "memory"); |
0 | 396 } |
651 | 397 |
398 //FIXME the following could be optimized too ... | |
1064 | 399 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 400 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); |
401 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); | |
402 } | |
1064 | 403 static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 404 DEF(put_pixels8_y2)(block , pixels , line_size, h); |
405 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); | |
406 } | |
1064 | 407 static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 408 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); |
409 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); | |
410 } | |
1064 | 411 static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 412 DEF(avg_pixels8)(block , pixels , line_size, h); |
413 DEF(avg_pixels8)(block+8, pixels+8, line_size, h); | |
414 } | |
1064 | 415 static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 416 DEF(avg_pixels8_x2)(block , pixels , line_size, h); |
417 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); | |
418 } | |
1064 | 419 static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 420 DEF(avg_pixels8_y2)(block , pixels , line_size, h); |
421 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); | |
422 } | |
1064 | 423 static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 424 DEF(avg_pixels8_xy2)(block , pixels , line_size, h); |
425 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); | |
426 } | |
427 |