Mercurial > libavcodec.hg
annotate i386/dsputil_mmx_avg.h @ 909:8ae1e4c24e91 libavcodec
new PSNR code (now works with chroma, b frames, ...)
rename *_TYPE to FF_*_TYPE for the external API
allow user specified pict_type
author | michaelni |
---|---|
date | Wed, 04 Dec 2002 21:13:02 +0000 |
parents | 45e8f39fda50 |
children | 13aec7e50c52 |
rev | line source |
---|---|
0 | 1 /* |
2 * DSP utils : average functions are compiled twice for 3dnow/mmx2 | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
386 | 4 * Copyright (c) 2002 Michael Niedermayer |
0 | 5 * |
429 | 6 * This library is free software; you can redistribute it and/or |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2 of the License, or (at your option) any later version. | |
0 | 10 * |
429 | 11 * This library is distributed in the hope that it will be useful, |
0 | 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 * Lesser General Public License for more details. | |
0 | 15 * |
429 | 16 * You should have received a copy of the GNU Lesser General Public |
17 * License along with this library; if not, write to the Free Software | |
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
0 | 19 * |
20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
386 | 21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> |
441 | 22 * and improved by Zdenek Kabelac <kabi@users.sf.net> |
0 | 23 */ |
387 | 24 |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by putting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
25 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
26 clobber bug - now it will work with 2.95.2 and also with -fPIC |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
27 */ |
/*
 * put_pixels8_x2: store an 8-byte-wide block in which every output byte is
 * PAVGB(pixels[x], pixels[x + 1]), i.e. each source row combined with the
 * same row shifted by one byte.
 *
 * block     - destination, advanced by line_size per row
 * pixels    - source; bytes 0..8 of every row are read
 * line_size - byte distance between consecutive rows (used for both buffers)
 * h         - number of rows; the loop handles 4 rows per pass and leaves
 *             only when the counter reaches exactly 0, so h has to be a
 *             positive multiple of 4
 *
 * DEF() and PAVGB are macros defined outside this listing. PAVGB is
 * presumably the packed byte-average mnemonic of the target CPU (the file
 * header says these functions are compiled once for 3DNow! and once for
 * MMX2) - confirm at the include site.
 * Asm operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI), %3 = line_size.
 * EAX is scratch and holds 2 * line_size.
 */
651 | 28 static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
0 | 29 { |
386 | 30 __asm __volatile( |
/* EAX = 2 * line_size: the step that moves a pointer down by two rows */
441 | 31 "lea (%3, %3), %%eax \n\t" |
32 "1: \n\t" | |
/* rows 0 and 1 of this pass: load, combine with the byte to the right, store */
33 "movq (%1), %%mm0 \n\t" | |
34 "movq (%1, %3), %%mm1 \n\t" | |
35 PAVGB" 1(%1), %%mm0 \n\t" | |
36 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
37 "movq %%mm0, (%2) \n\t" | |
38 "movq %%mm1, (%2, %3) \n\t" | |
39 "addl %%eax, %1 \n\t" | |
40 "addl %%eax, %2 \n\t" | |
/* rows 2 and 3 of this pass: same sequence on the advanced pointers */
41 "movq (%1), %%mm0 \n\t" | |
42 "movq (%1, %3), %%mm1 \n\t" | |
43 PAVGB" 1(%1), %%mm0 \n\t" | |
44 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
45 "addl %%eax, %1 \n\t" | |
46 "movq %%mm0, (%2) \n\t" | |
47 "movq %%mm1, (%2, %3) \n\t" | |
48 "addl %%eax, %2 \n\t" | |
/* four rows done; repeat until the row counter is exactly zero */
49 "subl $4, %0 \n\t" | |
50 "jnz 1b \n\t" | |
/* h, pixels and block are read-write because the asm modifies all three */
51 :"+g"(h), "+S"(pixels), "+D"(block) | |
52 :"r" (line_size) | |
53 :"%eax", "memory"); | |
54 } | |
651 | 55 |
/*
 * put_pixels16_x2: 16-byte-wide form of the horizontal half-sample store.
 * Every output byte is PAVGB(pixels[x], pixels[x + 1]); each row is handled
 * as two 8-byte halves (offsets 0 and 8), so bytes 0..16 of every source row
 * are read.
 *
 * block     - destination, advanced by line_size per row
 * pixels    - source
 * line_size - byte distance between consecutive rows (used for both buffers)
 * h         - number of rows; 4 rows are processed per pass and the loop
 *             ends only when the counter hits exactly 0, so h has to be a
 *             positive multiple of 4
 *
 * DEF() and PAVGB are macros defined outside this listing (PAVGB is
 * presumably the CPU-specific packed byte-average mnemonic - confirm).
 * Asm operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI), %3 = line_size.
 * EAX is scratch and holds 2 * line_size.
 */
56 static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
57 { | |
58 __asm __volatile( | |
/* EAX = 2 * line_size */
59 "lea (%3, %3), %%eax \n\t" | |
60 "1: \n\t" | |
/* rows 0 and 1: mm0/mm1 hold the left halves, mm2/mm3 the right halves */
61 "movq (%1), %%mm0 \n\t" | |
62 "movq (%1, %3), %%mm1 \n\t" | |
63 "movq 8(%1), %%mm2 \n\t" | |
64 "movq 8(%1, %3), %%mm3 \n\t" | |
/* combine each half with the same data shifted right by one byte */
65 PAVGB" 1(%1), %%mm0 \n\t" | |
66 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
67 PAVGB" 9(%1), %%mm2 \n\t" | |
68 PAVGB" 9(%1, %3), %%mm3 \n\t" | |
69 "movq %%mm0, (%2) \n\t" | |
70 "movq %%mm1, (%2, %3) \n\t" | |
71 "movq %%mm2, 8(%2) \n\t" | |
72 "movq %%mm3, 8(%2, %3) \n\t" | |
73 "addl %%eax, %1 \n\t" | |
74 "addl %%eax, %2 \n\t" | |
/* rows 2 and 3: identical sequence on the advanced pointers */
75 "movq (%1), %%mm0 \n\t" | |
76 "movq (%1, %3), %%mm1 \n\t" | |
77 "movq 8(%1), %%mm2 \n\t" | |
78 "movq 8(%1, %3), %%mm3 \n\t" | |
79 PAVGB" 1(%1), %%mm0 \n\t" | |
80 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
81 PAVGB" 9(%1), %%mm2 \n\t" | |
82 PAVGB" 9(%1, %3), %%mm3 \n\t" | |
83 "addl %%eax, %1 \n\t" | |
84 "movq %%mm0, (%2) \n\t" | |
85 "movq %%mm1, (%2, %3) \n\t" | |
86 "movq %%mm2, 8(%2) \n\t" | |
87 "movq %%mm3, 8(%2, %3) \n\t" | |
88 "addl %%eax, %2 \n\t" | |
/* four rows done; repeat until the row counter is exactly zero */
89 "subl $4, %0 \n\t" | |
90 "jnz 1b \n\t" | |
91 :"+g"(h), "+S"(pixels), "+D"(block) | |
92 :"r" (line_size) | |
93 :"%eax", "memory"); | |
94 } | |
441 | 95 |
96 /* GL: this function does incorrect rounding if overflow */ | |
/*
 * put_no_rnd_pixels8_x2: 8-byte-wide horizontal half-sample store, "no
 * rounding" flavour. For every row the left bytes pixels[x] first have mm6
 * subtracted with unsigned saturation (psubusb) and are then combined with
 * pixels[x + 1] through PAVGB.
 *
 * block     - destination, advanced by line_size per row
 * pixels    - source; bytes 0..8 of every row are read
 * line_size - byte distance between consecutive rows (used for both buffers)
 * h         - number of rows; 4 rows per pass, loop ends only on an exact 0,
 *             so h has to be a positive multiple of 4
 *
 * MOVQ_BONE, DEF() and PAVGB are macros defined outside this listing.
 * NOTE(review): MOVQ_BONE(mm6) presumably fills mm6 with 0x01 in every byte
 * and PAVGB presumably rounds up; under that assumption the result is
 * (a + b) >> 1 except where a == 0, because the saturating subtract cannot go
 * below 0 - which looks like the incorrect rounding the GL remark before this
 * function warns about. Confirm against the macro definitions.
 * mm6 is set by a separate statement and must survive until the asm below.
 * Asm operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI), %3 = line_size.
 * EAX is scratch and holds 2 * line_size.
 */
651 | 97 static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
441 | 98 { |
448 | 99 MOVQ_BONE(mm6); |
441 | 100 __asm __volatile( |
/* EAX = 2 * line_size */
101 "lea (%3, %3), %%eax \n\t" | |
439 | 102 "1: \n\t" |
/* rows 0 and 1: mm0/mm2 = pixels[x], mm1/mm3 = pixels[x + 1] */
103 "movq (%1), %%mm0 \n\t" | |
104 "movq (%1, %3), %%mm2 \n\t" | |
105 "movq 1(%1), %%mm1 \n\t" | |
106 "movq 1(%1, %3), %%mm3 \n\t" | |
/* source pointer moves on early; everything needed is already in registers */
441 | 107 "addl %%eax, %1 \n\t" |
/* bias the left operand down by mm6 before the average */
448 | 108 "psubusb %%mm6, %%mm0 \n\t" |
109 "psubusb %%mm6, %%mm2 \n\t" | |
386 | 110 PAVGB" %%mm1, %%mm0 \n\t" |
111 PAVGB" %%mm3, %%mm2 \n\t" | |
439 | 112 "movq %%mm0, (%2) \n\t" |
113 "movq %%mm2, (%2, %3) \n\t" | |
/* rows 2 and 3: same steps; block is advanced before the second store pair */
114 "movq (%1), %%mm0 \n\t" | |
115 "movq 1(%1), %%mm1 \n\t" | |
116 "movq (%1, %3), %%mm2 \n\t" | |
117 "movq 1(%1, %3), %%mm3 \n\t" | |
441 | 118 "addl %%eax, %2 \n\t" |
439 | 119 "addl %%eax, %1 \n\t" |
448 | 120 "psubusb %%mm6, %%mm0 \n\t" |
121 "psubusb %%mm6, %%mm2 \n\t" | |
386 | 122 PAVGB" %%mm1, %%mm0 \n\t" |
123 PAVGB" %%mm3, %%mm2 \n\t" | |
439 | 124 "movq %%mm0, (%2) \n\t" |
125 "movq %%mm2, (%2, %3) \n\t" | |
441 | 126 "addl %%eax, %2 \n\t" |
/* four rows done; repeat until the row counter is exactly zero */
127 "subl $4, %0 \n\t" | |
128 "jnz 1b \n\t" | |
129 :"+g"(h), "+S"(pixels), "+D"(block) | |
130 :"r" (line_size) | |
386 | 131 :"%eax", "memory"); |
0 | 132 } |
133 | |
/*
 * put_pixels8_y2: store an 8-byte-wide block in which output row y is
 * PAVGB(source row y, source row y + 1), i.e. each row combined with the row
 * below it. h + 1 source rows are read in total: row 0 before the loop and
 * four more rows per pass.
 *
 * block     - destination
 * pixels    - source
 * line_size - byte distance between consecutive rows (used for both buffers)
 * h         - number of rows; 4 rows per pass, loop ends only on an exact 0,
 *             so h has to be a positive multiple of 4
 *
 * The most recently loaded row is kept in a register across passes (mm0 at
 * the loop head), so every source row is loaded only once.
 * DEF() and PAVGB are macros defined outside this listing (PAVGB is
 * presumably the CPU-specific packed byte-average mnemonic - confirm).
 * Asm operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI), %3 = line_size.
 * EAX is scratch and holds 2 * line_size.
 */
651 | 134 static void DEF(put_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
0 | 135 { |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
136 __asm __volatile( |
/* EAX = 2 * line_size */
441 | 137 "lea (%3, %3), %%eax \n\t" |
/* prime mm0 with source row 0 */
138 "movq (%1), %%mm0 \n\t" | |
/* step block back one row so that (%2,%3) and (%2,%eax) address output rows 0 and 1 */
139 "subl %3, %2 \n\t" | |
140 "1: \n\t" | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
/* mm1 = source row 1, mm2 = source row 2 (relative to this pass) */
141 "movq (%1, %3), %%mm1 \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
142 "movq (%1, %%eax), %%mm2 \n\t" |
441 | 143 "addl %%eax, %1 \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
/* mm0 = rows 0+1 combined, mm1 = rows 1+2 combined */
144 PAVGB" %%mm1, %%mm0 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
145 PAVGB" %%mm2, %%mm1 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
146 "movq %%mm0, (%2, %3) \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
147 "movq %%mm1, (%2, %%eax) \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
/* second half: mm1 = source row 3, mm0 = source row 4; mm2 still holds row 2 */
148 "movq (%1, %3), %%mm1 \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
149 "movq (%1, %%eax), %%mm0 \n\t" |
441 | 150 "addl %%eax, %2 \n\t" |
151 "addl %%eax, %1 \n\t" | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
/* mm2 = rows 2+3 combined, mm1 = rows 3+4 combined; mm0 (row 4) carries into the next pass */
152 PAVGB" %%mm1, %%mm2 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
153 PAVGB" %%mm0, %%mm1 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
154 "movq %%mm2, (%2, %3) \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
155 "movq %%mm1, (%2, %%eax) \n\t" |
441 | 156 "addl %%eax, %2 \n\t" |
/* four rows done; repeat until the row counter is exactly zero */
157 "subl $4, %0 \n\t" | |
158 "jnz 1b \n\t" | |
159 :"+g"(h), "+S"(pixels), "+D" (block) | |
160 :"r" (line_size) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
161 :"%eax", "memory"); |
386 | 162 } |
163 | |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by putting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
164 /* GL: this function does incorrect rounding if overflow */ |
/*
 * put_no_rnd_pixels8_y2: 8-byte-wide vertical half-sample store, "no
 * rounding" flavour. Output row y is PAVGB of source rows y and y + 1, where
 * the odd-numbered source row of each pair (rows 1 and 3 of every 4-row pass)
 * first has mm6 subtracted with unsigned saturation. Each odd row takes part
 * in two outputs, so every output gets exactly one biased operand.
 * h + 1 source rows are read in total.
 *
 * block     - destination
 * pixels    - source
 * line_size - byte distance between consecutive rows (used for both buffers)
 * h         - number of rows; 4 rows per pass, loop ends only on an exact 0,
 *             so h has to be a positive multiple of 4
 *
 * MOVQ_BONE, DEF() and PAVGB are macros defined outside this listing.
 * NOTE(review): MOVQ_BONE(mm6) presumably fills mm6 with 0x01 bytes; if so
 * the bias is lost wherever the biased row holds 0 (psubusb saturates),
 * which would explain the rounding warning placed before this function -
 * confirm against the macro definitions.
 * Asm operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI), %3 = line_size.
 * EAX is scratch and holds 2 * line_size.
 */
651 | 165 static void DEF(put_no_rnd_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
386 | 166 { |
448 | 167 MOVQ_BONE(mm6); |
0 | 168 __asm __volatile( |
/* EAX = 2 * line_size */
441 | 169 "lea (%3, %3), %%eax \n\t" |
/* prime mm0 with source row 0 */
170 "movq (%1), %%mm0 \n\t" | |
/* step block back one row so that (%2,%3) and (%2,%eax) address output rows 0 and 1 */
171 "subl %3, %2 \n\t" | |
172 "1: \n\t" | |
/* mm1 = source row 1, mm2 = source row 2 */
439 | 173 "movq (%1, %3), %%mm1 \n\t" |
174 "movq (%1, %%eax), %%mm2 \n\t" | |
441 | 175 "addl %%eax, %1 \n\t" |
/* bias row 1; it is shared by the next two averages */
448 | 176 "psubusb %%mm6, %%mm1 \n\t" |
386 | 177 PAVGB" %%mm1, %%mm0 \n\t" |
178 PAVGB" %%mm2, %%mm1 \n\t" | |
439 | 179 "movq %%mm0, (%2, %3) \n\t" |
180 "movq %%mm1, (%2, %%eax) \n\t" | |
/* second half: mm1 = source row 3, mm0 = source row 4; mm2 still holds row 2 */
181 "movq (%1, %3), %%mm1 \n\t" | |
182 "movq (%1, %%eax), %%mm0 \n\t" | |
441 | 183 "addl %%eax, %2 \n\t" |
184 "addl %%eax, %1 \n\t" | |
/* bias row 3; mm0 (row 4, unbiased) carries into the next pass */
448 | 185 "psubusb %%mm6, %%mm1 \n\t" |
386 | 186 PAVGB" %%mm1, %%mm2 \n\t" |
187 PAVGB" %%mm0, %%mm1 \n\t" | |
439 | 188 "movq %%mm2, (%2, %3) \n\t" |
189 "movq %%mm1, (%2, %%eax) \n\t" | |
441 | 190 "addl %%eax, %2 \n\t" |
/* four rows done; repeat until the row counter is exactly zero */
191 "subl $4, %0 \n\t" | |
192 "jnz 1b \n\t" | |
193 :"+g"(h), "+S"(pixels), "+D" (block) | |
194 :"r" (line_size) | |
439 | 195 :"%eax", "memory"); |
0 | 196 } |
197 | |
/*
 * avg_pixels8: blend an 8-byte-wide source block into the destination.
 * For every row the existing 8 destination bytes are loaded, combined with
 * the 8 source bytes through PAVGB and written back to the destination.
 *
 * block     - destination, read and written
 * pixels    - source
 * line_size - byte distance between consecutive rows (used for both buffers)
 * h         - number of rows; 4 rows per pass, loop ends only on an exact 0,
 *             so h has to be a positive multiple of 4
 *
 * DEF() and PAVGB are macros defined outside this listing (PAVGB is
 * presumably the CPU-specific packed byte-average mnemonic - confirm).
 * Asm operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI), %3 = line_size.
 * EAX is scratch and holds 2 * line_size.
 */
651 | 198 static void DEF(avg_pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
0 | 199 { |
200 __asm __volatile( | |
/* EAX = 2 * line_size */
441 | 201 "lea (%3, %3), %%eax \n\t" |
202 "1: \n\t" | |
/* rows 0 and 1: destination is loaded first, then combined with the source */
203 "movq (%2), %%mm0 \n\t" | |
204 "movq (%2, %3), %%mm1 \n\t" | |
205 PAVGB" (%1), %%mm0 \n\t" | |
206 PAVGB" (%1, %3), %%mm1 \n\t" | |
207 "movq %%mm0, (%2) \n\t" | |
208 "movq %%mm1, (%2, %3) \n\t" | |
209 "addl %%eax, %1 \n\t" | |
210 "addl %%eax, %2 \n\t" | |
/* rows 2 and 3: same sequence on the advanced pointers */
211 "movq (%2), %%mm0 \n\t" | |
212 "movq (%2, %3), %%mm1 \n\t" | |
213 PAVGB" (%1), %%mm0 \n\t" | |
214 PAVGB" (%1, %3), %%mm1 \n\t" | |
215 "addl %%eax, %1 \n\t" | |
216 "movq %%mm0, (%2) \n\t" | |
217 "movq %%mm1, (%2, %3) \n\t" | |
218 "addl %%eax, %2 \n\t" | |
/* four rows done; repeat until the row counter is exactly zero */
219 "subl $4, %0 \n\t" | |
220 "jnz 1b \n\t" | |
221 :"+g"(h), "+S"(pixels), "+D"(block) | |
222 :"r" (line_size) | |
386 | 223 :"%eax", "memory"); |
0 | 224 } |
225 | |
/*
 * avg_pixels8_x2: horizontal half-sample blend, 8 bytes wide.
 * For every row: t = PAVGB(pixels[x], pixels[x + 1]), then
 * block[x] = PAVGB(t, block[x]). Bytes 0..8 of every source row are read and
 * the destination is both read and written.
 *
 * block     - destination, read and written
 * pixels    - source
 * line_size - byte distance between consecutive rows (used for both buffers)
 * h         - number of rows; 4 rows per pass, loop ends only on an exact 0,
 *             so h has to be a positive multiple of 4
 *
 * DEF() and PAVGB are macros defined outside this listing (PAVGB is
 * presumably the CPU-specific packed byte-average mnemonic - confirm).
 * Asm operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI), %3 = line_size.
 * EAX is scratch and holds 2 * line_size.
 */
651 | 226 static void DEF(avg_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
0 | 227 { |
228 __asm __volatile( | |
/* EAX = 2 * line_size */
441 | 229 "lea (%3, %3), %%eax \n\t" |
230 "1: \n\t" | |
/* rows 0 and 1: source with its right neighbour, then with the destination */
231 "movq (%1), %%mm0 \n\t" | |
232 "movq (%1, %3), %%mm2 \n\t" | |
233 PAVGB" 1(%1), %%mm0 \n\t" | |
234 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
235 PAVGB" (%2), %%mm0 \n\t" | |
236 PAVGB" (%2, %3), %%mm2 \n\t" | |
237 "addl %%eax, %1 \n\t" | |
238 "movq %%mm0, (%2) \n\t" | |
239 "movq %%mm2, (%2, %3) \n\t" | |
/* rows 2 and 3: block is advanced before its old contents are blended in */
240 "movq (%1), %%mm0 \n\t" | |
241 "movq (%1, %3), %%mm2 \n\t" | |
242 PAVGB" 1(%1), %%mm0 \n\t" | |
243 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
244 "addl %%eax, %2 \n\t" | |
245 "addl %%eax, %1 \n\t" | |
246 PAVGB" (%2), %%mm0 \n\t" | |
247 PAVGB" (%2, %3), %%mm2 \n\t" | |
248 "movq %%mm0, (%2) \n\t" | |
249 "movq %%mm2, (%2, %3) \n\t" | |
250 "addl %%eax, %2 \n\t" | |
/* four rows done; repeat until the row counter is exactly zero */
251 "subl $4, %0 \n\t" | |
252 "jnz 1b \n\t" | |
253 :"+g"(h), "+S"(pixels), "+D"(block) | |
254 :"r" (line_size) | |
386 | 255 :"%eax", "memory"); |
0 | 256 } |
257 | |
/*
 * avg_pixels8_y2: vertical half-sample blend, 8 bytes wide.
 * For output row y: t = PAVGB(source row y, source row y + 1), then
 * block row y = PAVGB(t, old block row y). h + 1 source rows are read in
 * total (row 0 before the loop, four more per pass); the destination is both
 * read and written.
 *
 * block     - destination, read and written
 * pixels    - source
 * line_size - byte distance between consecutive rows (used for both buffers)
 * h         - number of rows; 4 rows per pass, loop ends only on an exact 0,
 *             so h has to be a positive multiple of 4
 *
 * DEF() and PAVGB are macros defined outside this listing (PAVGB is
 * presumably the CPU-specific packed byte-average mnemonic - confirm).
 * Asm operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI), %3 = line_size.
 * EAX is scratch and holds 2 * line_size.
 */
651 | 258 static void DEF(avg_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
0 | 259 { |
260 __asm __volatile( | |
/* EAX = 2 * line_size */
441 | 261 "lea (%3, %3), %%eax \n\t" |
/* prime mm0 with source row 0 */
386 | 262 "movq (%1), %%mm0 \n\t" |
/* step block back one row so that (%2,%3) and (%2,%eax) address output rows 0 and 1 */
441 | 263 "subl %3, %2 \n\t" |
264 "1: \n\t" | |
/* mm1 = source row 1, mm2 = source row 2 */
265 "movq (%1, %3), %%mm1 \n\t" | |
266 "movq (%1, %%eax), %%mm2 \n\t" | |
267 "addl %%eax, %1 \n\t" | |
/* mm0 = rows 0+1 combined, mm1 = rows 1+2 combined */
386 | 268 PAVGB" %%mm1, %%mm0 \n\t" |
269 PAVGB" %%mm2, %%mm1 \n\t" | |
/* fetch the current destination rows 0 and 1 and blend them in */
441 | 270 "movq (%2, %3), %%mm3 \n\t" |
271 "movq (%2, %%eax), %%mm4 \n\t" | |
386 | 272 PAVGB" %%mm3, %%mm0 \n\t" |
273 PAVGB" %%mm4, %%mm1 \n\t" | |
441 | 274 "movq %%mm0, (%2, %3) \n\t" |
275 "movq %%mm1, (%2, %%eax) \n\t" | |
/* second half: mm1 = source row 3, mm0 = source row 4; mm2 still holds row 2 */
276 "movq (%1, %3), %%mm1 \n\t" | |
277 "movq (%1, %%eax), %%mm0 \n\t" | |
/* mm2 = rows 2+3 combined, mm1 = rows 3+4 combined; mm0 (row 4) carries into the next pass */
386 | 278 PAVGB" %%mm1, %%mm2 \n\t" |
279 PAVGB" %%mm0, %%mm1 \n\t" | |
441 | 280 "addl %%eax, %2 \n\t" |
281 "addl %%eax, %1 \n\t" | |
/* fetch the current destination rows 2 and 3 and blend them in */
282 "movq (%2, %3), %%mm3 \n\t" | |
283 "movq (%2, %%eax), %%mm4 \n\t" | |
386 | 284 PAVGB" %%mm3, %%mm2 \n\t" |
285 PAVGB" %%mm4, %%mm1 \n\t" | |
441 | 286 "movq %%mm2, (%2, %3) \n\t" |
287 "movq %%mm1, (%2, %%eax) \n\t" | |
288 "addl %%eax, %2 \n\t" | |
/* four rows done; repeat until the row counter is exactly zero */
289 "subl $4, %0 \n\t" | |
290 "jnz 1b \n\t" | |
291 :"+g"(h), "+S"(pixels), "+D"(block) | |
292 :"r" (line_size) | |
293 :"%eax", "memory"); | |
0 | 294 } |
295 | |
386 | 296 // Note: this is not correctly rounded, but this function is only used for B-frames, so it doesn't matter |
/*
 * avg_pixels8_xy2: diagonal (x + y) half-sample blend, 8 bytes wide, built
 * from cascaded PAVGB steps:
 *   hrow(y) = PAVGB(row y [x], row y [x + 1])      horizontal step
 *   v(y)    = PAVGB(hrow(y), hrow(y + 1))          vertical step
 *   block row y = PAVGB(v(y), old block row y)     blend with destination
 * In each 4-row pass only source row 2 has mm6 subtracted (psubusb, unsigned
 * saturation) before its horizontal step; rows 1, 3 and 4 are used unbiased.
 * h + 1 source rows are read in total, bytes 0..8 of each.
 *
 * block     - destination, read and written
 * pixels    - source
 * line_size - byte distance between consecutive rows (used for both buffers)
 * h         - number of rows; 4 rows per pass, loop ends only on an exact 0,
 *             so h has to be a positive multiple of 4
 *
 * MOVQ_BONE, DEF() and PAVGB are macros defined outside this listing.
 * NOTE(review): MOVQ_BONE(mm6) presumably fills mm6 with 0x01 bytes as a
 * partial compensation for the rounding of the cascaded averages - confirm
 * against the macro definition.
 * Asm operands: %0 = h, %1 = pixels (ESI), %2 = block (EDI), %3 = line_size.
 * EAX is scratch and holds 2 * line_size.
 */
651 | 297 static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
0 | 298 { |
448 | 299 MOVQ_BONE(mm6); |
0 | 300 __asm __volatile( |
/* EAX = 2 * line_size */
442 | 301 "lea (%3, %3), %%eax \n\t" |
/* prime mm0 with hrow(0) */
386 | 302 "movq (%1), %%mm0 \n\t" |
442 | 303 PAVGB" 1(%1), %%mm0 \n\t" |
/* align the loop entry to an 8-byte boundary */
304 ".balign 8 \n\t" | |
441 | 305 "1: \n\t" |
/* mm2 = source row 2 (biased by mm6), mm1 = source row 1 */
442 | 306 "movq (%1, %%eax), %%mm2 \n\t" |
307 "movq (%1, %3), %%mm1 \n\t" | |
448 | 308 "psubusb %%mm6, %%mm2 \n\t" |
/* horizontal step: mm1 = hrow(1), mm2 = hrow(2) */
442 | 309 PAVGB" 1(%1, %3), %%mm1 \n\t" |
310 PAVGB" 1(%1, %%eax), %%mm2 \n\t" | |
311 "addl %%eax, %1 \n\t" | |
/* vertical step: mm0 = v(0), mm1 = v(1) */
386 | 312 PAVGB" %%mm1, %%mm0 \n\t" |
313 PAVGB" %%mm2, %%mm1 \n\t" | |
/* blend with the existing destination rows 0 and 1, then store */
442 | 314 PAVGB" (%2), %%mm0 \n\t" |
315 PAVGB" (%2, %3), %%mm1 \n\t" | |
316 "movq %%mm0, (%2) \n\t" | |
317 "movq %%mm1, (%2, %3) \n\t" | |
/* second half: mm1 = hrow(3), mm0 = hrow(4); mm2 still holds hrow(2) */
318 "movq (%1, %3), %%mm1 \n\t" | |
319 "movq (%1, %%eax), %%mm0 \n\t" | |
320 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
321 PAVGB" 1(%1, %%eax), %%mm0 \n\t" | |
322 "addl %%eax, %2 \n\t" | |
323 "addl %%eax, %1 \n\t" | |
/* vertical step: mm2 = v(2), mm1 = v(3); mm0 (hrow(4)) carries into the next pass */
386 | 324 PAVGB" %%mm1, %%mm2 \n\t" |
325 PAVGB" %%mm0, %%mm1 \n\t" | |
/* blend with the existing destination rows 2 and 3, then store */
442 | 326 PAVGB" (%2), %%mm2 \n\t" |
327 PAVGB" (%2, %3), %%mm1 \n\t" | |
328 "movq %%mm2, (%2) \n\t" | |
329 "movq %%mm1, (%2, %3) \n\t" | |
330 "addl %%eax, %2 \n\t" | |
/* four rows done; repeat until the row counter is exactly zero */
441 | 331 "subl $4, %0 \n\t" |
442 | 332 "jnz 1b \n\t" |
443 | 333 :"+g"(h), "+S"(pixels), "+D"(block) |
442 | 334 :"r" (line_size) |
386 | 335 :"%eax", "memory"); |
0 | 336 } |
651 | 337 |
338 //FIXME the following could be optimized too ... | |
/*
 * put_no_rnd_pixels16_x2: 16-byte-wide variant assembled from two calls to
 * the 8-byte-wide put_no_rnd_pixels8_x2 - first the left half, then the
 * right half at a +8 byte offset in both buffers. line_size and h are passed
 * through unchanged.
 */
339 static void DEF(put_no_rnd_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | |
340 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); | |
341 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); | |
342 } | |
/*
 * put_pixels16_y2: 16-byte-wide variant assembled from two calls to the
 * 8-byte-wide put_pixels8_y2 - first the left half, then the right half at a
 * +8 byte offset in both buffers. line_size and h are passed through
 * unchanged.
 */
343 static void DEF(put_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | |
344 DEF(put_pixels8_y2)(block , pixels , line_size, h); | |
345 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); | |
346 } | |
/*
 * put_no_rnd_pixels16_y2: 16-byte-wide variant assembled from two calls to
 * the 8-byte-wide put_no_rnd_pixels8_y2 - first the left half, then the
 * right half at a +8 byte offset in both buffers. line_size and h are passed
 * through unchanged.
 */
347 static void DEF(put_no_rnd_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | |
348 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); | |
349 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); | |
350 } | |
/*
 * avg_pixels16: 16-byte-wide variant assembled from two calls to the
 * 8-byte-wide avg_pixels8 - first the left half, then the right half at a
 * +8 byte offset in both buffers. line_size and h are passed through
 * unchanged.
 */
351 static void DEF(avg_pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | |
352 DEF(avg_pixels8)(block , pixels , line_size, h); | |
353 DEF(avg_pixels8)(block+8, pixels+8, line_size, h); | |
354 } | |
/*
 * avg_pixels16_x2: 16-byte-wide variant assembled from two calls to the
 * 8-byte-wide avg_pixels8_x2 - first the left half, then the right half at a
 * +8 byte offset in both buffers. line_size and h are passed through
 * unchanged.
 */
355 static void DEF(avg_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | |
356 DEF(avg_pixels8_x2)(block , pixels , line_size, h); | |
357 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); | |
358 } | |
/*
 * avg_pixels16_y2: 16-byte-wide variant assembled from two calls to the
 * 8-byte-wide avg_pixels8_y2 - first the left half, then the right half at a
 * +8 byte offset in both buffers. line_size and h are passed through
 * unchanged.
 */
359 static void DEF(avg_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | |
360 DEF(avg_pixels8_y2)(block , pixels , line_size, h); | |
361 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); | |
362 } | |
/*
 * avg_pixels16_xy2: 16-byte-wide variant assembled from two calls to the
 * 8-byte-wide avg_pixels8_xy2 - first the left half, then the right half at
 * a +8 byte offset in both buffers. line_size and h are passed through
 * unchanged.
 */
363 static void DEF(avg_pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | |
364 DEF(avg_pixels8_xy2)(block , pixels , line_size, h); | |
365 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); | |
366 } | |
367 |