Mercurial > libavcodec.hg
annotate i386/dsputil_mmx_avg.h @ 442:006965950f49 libavcodec
* optimized remaining avg_pixels_xy2
author | kabi |
---|---|
date | Wed, 29 May 2002 08:31:22 +0000 |
parents | c0de4d3c7d3c |
children | 63467327c06c |
rev | line source |
---|---|
0 | 1 /* |
2 * DSP utils : average functions are compiled twice for 3dnow/mmx2 | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
386 | 4 * Copyright (c) 2002 Michael Niedermayer |
0 | 5 * |
429 | 6 * This library is free software; you can redistribute it and/or |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2 of the License, or (at your option) any later version. | |
0 | 10 * |
429 | 11 * This library is distributed in the hope that it will be useful, |
0 | 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 * Lesser General Public License for more details. | |
0 | 15 * |
429 | 16 * You should have received a copy of the GNU Lesser General Public |
17 * License along with this library; if not, write to the Free Software | |
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
0 | 19 * |
20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
386 | 21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> |
441 | 22 * and improved by Zdenek Kabelac <kabi@users.sf.net> |
0 | 23 */ |
387 | 24 |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by putting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
25 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
26 clobber bug - now it will work with 2.95.2 and also with -fPIC |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
27 */ |
0 | 28 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
29 { | |
386 | 30 __asm __volatile( |
441 | 31 "lea (%3, %3), %%eax \n\t" |
32 "1: \n\t" | |
33 "movq (%1), %%mm0 \n\t" | |
34 "movq (%1, %3), %%mm1 \n\t" | |
35 PAVGB" 1(%1), %%mm0 \n\t" | |
36 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
37 "movq %%mm0, (%2) \n\t" | |
38 "movq %%mm1, (%2, %3) \n\t" | |
39 "addl %%eax, %1 \n\t" | |
40 "addl %%eax, %2 \n\t" | |
41 "movq (%1), %%mm0 \n\t" | |
42 "movq (%1, %3), %%mm1 \n\t" | |
43 PAVGB" 1(%1), %%mm0 \n\t" | |
44 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
45 "addl %%eax, %1 \n\t" | |
46 "movq %%mm0, (%2) \n\t" | |
47 "movq %%mm1, (%2, %3) \n\t" | |
48 "addl %%eax, %2 \n\t" | |
49 "subl $4, %0 \n\t" | |
50 "jnz 1b \n\t" | |
51 :"+g"(h), "+S"(pixels), "+D"(block) | |
52 :"r" (line_size) | |
53 :"%eax", "memory"); | |
54 } | |
55 | |
56 /* GL: this function does incorrect rounding if overflow */ | |
57 static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
58 { | |
59 __asm __volatile( | |
60 "lea (%3, %3), %%eax \n\t" | |
61 MOVQ_BONE(%%mm7) | |
439 | 62 "1: \n\t" |
63 "movq (%1), %%mm0 \n\t" | |
64 "movq (%1, %3), %%mm2 \n\t" | |
65 "movq 1(%1), %%mm1 \n\t" | |
66 "movq 1(%1, %3), %%mm3 \n\t" | |
441 | 67 "addl %%eax, %1 \n\t" |
68 "psubusb %%mm7, %%mm0 \n\t" | |
69 "psubusb %%mm7, %%mm2 \n\t" | |
386 | 70 PAVGB" %%mm1, %%mm0 \n\t" |
71 PAVGB" %%mm3, %%mm2 \n\t" | |
439 | 72 "movq %%mm0, (%2) \n\t" |
73 "movq %%mm2, (%2, %3) \n\t" | |
74 "movq (%1), %%mm0 \n\t" | |
75 "movq 1(%1), %%mm1 \n\t" | |
76 "movq (%1, %3), %%mm2 \n\t" | |
77 "movq 1(%1, %3), %%mm3 \n\t" | |
441 | 78 "addl %%eax, %2 \n\t" |
439 | 79 "addl %%eax, %1 \n\t" |
441 | 80 "psubusb %%mm7, %%mm0 \n\t" |
81 "psubusb %%mm7, %%mm2 \n\t" | |
386 | 82 PAVGB" %%mm1, %%mm0 \n\t" |
83 PAVGB" %%mm3, %%mm2 \n\t" | |
439 | 84 "movq %%mm0, (%2) \n\t" |
85 "movq %%mm2, (%2, %3) \n\t" | |
441 | 86 "addl %%eax, %2 \n\t" |
87 "subl $4, %0 \n\t" | |
88 "jnz 1b \n\t" | |
89 :"+g"(h), "+S"(pixels), "+D"(block) | |
90 :"r" (line_size) | |
386 | 91 :"%eax", "memory"); |
0 | 92 } |
93 | |
94 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
95 { | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
96 __asm __volatile( |
441 | 97 "lea (%3, %3), %%eax \n\t" |
98 "movq (%1), %%mm0 \n\t" | |
99 "subl %3, %2 \n\t" | |
100 "1: \n\t" | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
101 "movq (%1, %3), %%mm1 \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
102 "movq (%1, %%eax), %%mm2 \n\t" |
441 | 103 "addl %%eax, %1 \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
104 PAVGB" %%mm1, %%mm0 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
105 PAVGB" %%mm2, %%mm1 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
106 "movq %%mm0, (%2, %3) \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
107 "movq %%mm1, (%2, %%eax) \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
108 "movq (%1, %3), %%mm1 \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
109 "movq (%1, %%eax), %%mm0 \n\t" |
441 | 110 "addl %%eax, %2 \n\t" |
111 "addl %%eax, %1 \n\t" | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
112 PAVGB" %%mm1, %%mm2 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
113 PAVGB" %%mm0, %%mm1 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
114 "movq %%mm2, (%2, %3) \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
115 "movq %%mm1, (%2, %%eax) \n\t" |
441 | 116 "addl %%eax, %2 \n\t" |
117 "subl $4, %0 \n\t" | |
118 "jnz 1b \n\t" | |
119 :"+g"(h), "+S"(pixels), "+D" (block) | |
120 :"r" (line_size) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
121 :"%eax", "memory"); |
386 | 122 } |
123 | |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
124 /* GL: this function does incorrect rounding if overflow */ |
386 | 125 static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
126 { | |
0 | 127 __asm __volatile( |
387 | 128 MOVQ_BONE(%%mm7) |
441 | 129 "lea (%3, %3), %%eax \n\t" |
130 "movq (%1), %%mm0 \n\t" | |
131 "subl %3, %2 \n\t" | |
132 "1: \n\t" | |
439 | 133 "movq (%1, %3), %%mm1 \n\t" |
134 "movq (%1, %%eax), %%mm2 \n\t" | |
441 | 135 "addl %%eax, %1 \n\t" |
136 "psubusb %%mm7, %%mm1 \n\t" | |
386 | 137 PAVGB" %%mm1, %%mm0 \n\t" |
138 PAVGB" %%mm2, %%mm1 \n\t" | |
439 | 139 "movq %%mm0, (%2, %3) \n\t" |
140 "movq %%mm1, (%2, %%eax) \n\t" | |
141 "movq (%1, %3), %%mm1 \n\t" | |
142 "movq (%1, %%eax), %%mm0 \n\t" | |
441 | 143 "addl %%eax, %2 \n\t" |
144 "addl %%eax, %1 \n\t" | |
145 "psubusb %%mm7, %%mm1 \n\t" | |
386 | 146 PAVGB" %%mm1, %%mm2 \n\t" |
147 PAVGB" %%mm0, %%mm1 \n\t" | |
439 | 148 "movq %%mm2, (%2, %3) \n\t" |
149 "movq %%mm1, (%2, %%eax) \n\t" | |
441 | 150 "addl %%eax, %2 \n\t" |
151 "subl $4, %0 \n\t" | |
152 "jnz 1b \n\t" | |
153 :"+g"(h), "+S"(pixels), "+D" (block) | |
154 :"r" (line_size) | |
439 | 155 :"%eax", "memory"); |
0 | 156 } |
157 | |
158 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
159 { | |
160 __asm __volatile( | |
441 | 161 "lea (%3, %3), %%eax \n\t" |
162 "1: \n\t" | |
163 "movq (%2), %%mm0 \n\t" | |
164 "movq (%2, %3), %%mm1 \n\t" | |
165 PAVGB" (%1), %%mm0 \n\t" | |
166 PAVGB" (%1, %3), %%mm1 \n\t" | |
167 "movq %%mm0, (%2) \n\t" | |
168 "movq %%mm1, (%2, %3) \n\t" | |
169 "addl %%eax, %1 \n\t" | |
170 "addl %%eax, %2 \n\t" | |
171 "movq (%2), %%mm0 \n\t" | |
172 "movq (%2, %3), %%mm1 \n\t" | |
173 PAVGB" (%1), %%mm0 \n\t" | |
174 PAVGB" (%1, %3), %%mm1 \n\t" | |
175 "addl %%eax, %1 \n\t" | |
176 "movq %%mm0, (%2) \n\t" | |
177 "movq %%mm1, (%2, %3) \n\t" | |
178 "addl %%eax, %2 \n\t" | |
179 "subl $4, %0 \n\t" | |
180 "jnz 1b \n\t" | |
181 :"+g"(h), "+S"(pixels), "+D"(block) | |
182 :"r" (line_size) | |
386 | 183 :"%eax", "memory"); |
0 | 184 } |
185 | |
386 | 186 static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
0 | 187 { |
188 __asm __volatile( | |
441 | 189 "lea (%3, %3), %%eax \n\t" |
190 "1: \n\t" | |
191 "movq (%1), %%mm0 \n\t" | |
192 "movq (%1, %3), %%mm2 \n\t" | |
193 PAVGB" 1(%1), %%mm0 \n\t" | |
194 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
195 PAVGB" (%2), %%mm0 \n\t" | |
196 PAVGB" (%2, %3), %%mm2 \n\t" | |
197 "addl %%eax, %1 \n\t" | |
198 "movq %%mm0, (%2) \n\t" | |
199 "movq %%mm2, (%2, %3) \n\t" | |
200 "movq (%1), %%mm0 \n\t" | |
201 "movq (%1, %3), %%mm2 \n\t" | |
202 PAVGB" 1(%1), %%mm0 \n\t" | |
203 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
204 "addl %%eax, %2 \n\t" | |
205 "addl %%eax, %1 \n\t" | |
206 PAVGB" (%2), %%mm0 \n\t" | |
207 PAVGB" (%2, %3), %%mm2 \n\t" | |
208 "movq %%mm0, (%2) \n\t" | |
209 "movq %%mm2, (%2, %3) \n\t" | |
210 "addl %%eax, %2 \n\t" | |
211 "subl $4, %0 \n\t" | |
212 "jnz 1b \n\t" | |
213 :"+g"(h), "+S"(pixels), "+D"(block) | |
214 :"r" (line_size) | |
386 | 215 :"%eax", "memory"); |
0 | 216 } |
217 | |
386 | 218 static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
0 | 219 { |
220 __asm __volatile( | |
441 | 221 "lea (%3, %3), %%eax \n\t" |
386 | 222 "movq (%1), %%mm0 \n\t" |
441 | 223 "subl %3, %2 \n\t" |
224 "1: \n\t" | |
225 "movq (%1, %3), %%mm1 \n\t" | |
226 "movq (%1, %%eax), %%mm2 \n\t" | |
227 "addl %%eax, %1 \n\t" | |
386 | 228 PAVGB" %%mm1, %%mm0 \n\t" |
229 PAVGB" %%mm2, %%mm1 \n\t" | |
441 | 230 "movq (%2, %3), %%mm3 \n\t" |
231 "movq (%2, %%eax), %%mm4 \n\t" | |
386 | 232 PAVGB" %%mm3, %%mm0 \n\t" |
233 PAVGB" %%mm4, %%mm1 \n\t" | |
441 | 234 "movq %%mm0, (%2, %3) \n\t" |
235 "movq %%mm1, (%2, %%eax) \n\t" | |
236 "movq (%1, %3), %%mm1 \n\t" | |
237 "movq (%1, %%eax), %%mm0 \n\t" | |
386 | 238 PAVGB" %%mm1, %%mm2 \n\t" |
239 PAVGB" %%mm0, %%mm1 \n\t" | |
441 | 240 "addl %%eax, %2 \n\t" |
241 "addl %%eax, %1 \n\t" | |
242 "movq (%2, %3), %%mm3 \n\t" | |
243 "movq (%2, %%eax), %%mm4 \n\t" | |
386 | 244 PAVGB" %%mm3, %%mm2 \n\t" |
245 PAVGB" %%mm4, %%mm1 \n\t" | |
441 | 246 "movq %%mm2, (%2, %3) \n\t" |
247 "movq %%mm1, (%2, %%eax) \n\t" | |
248 "addl %%eax, %2 \n\t" | |
249 "subl $4, %0 \n\t" | |
250 "jnz 1b \n\t" | |
251 :"+g"(h), "+S"(pixels), "+D"(block) | |
252 :"r" (line_size) | |
253 :"%eax", "memory"); | |
0 | 254 } |
255 | |
386 | 256 // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter |
257 static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
0 | 258 { |
259 __asm __volatile( | |
387 | 260 MOVQ_BONE(%%mm7) |
442 | 261 "lea (%3, %3), %%eax \n\t" |
386 | 262 "movq (%1), %%mm0 \n\t" |
442 | 263 PAVGB" 1(%1), %%mm0 \n\t" |
264 ".balign 8 \n\t" | |
441 | 265 "1: \n\t" |
442 | 266 "movq (%1, %%eax), %%mm2 \n\t" |
267 "movq (%1, %3), %%mm1 \n\t" | |
441 | 268 "psubusb %%mm7, %%mm2 \n\t" |
442 | 269 PAVGB" 1(%1, %3), %%mm1 \n\t" |
270 PAVGB" 1(%1, %%eax), %%mm2 \n\t" | |
271 "addl %%eax, %1 \n\t" | |
386 | 272 PAVGB" %%mm1, %%mm0 \n\t" |
273 PAVGB" %%mm2, %%mm1 \n\t" | |
442 | 274 PAVGB" (%2), %%mm0 \n\t" |
275 PAVGB" (%2, %3), %%mm1 \n\t" | |
276 "movq %%mm0, (%2) \n\t" | |
277 "movq %%mm1, (%2, %3) \n\t" | |
278 "movq (%1, %3), %%mm1 \n\t" | |
279 "movq (%1, %%eax), %%mm0 \n\t" | |
280 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
281 PAVGB" 1(%1, %%eax), %%mm0 \n\t" | |
282 "addl %%eax, %2 \n\t" | |
283 "addl %%eax, %1 \n\t" | |
386 | 284 PAVGB" %%mm1, %%mm2 \n\t" |
285 PAVGB" %%mm0, %%mm1 \n\t" | |
442 | 286 PAVGB" (%2), %%mm2 \n\t" |
287 PAVGB" (%2, %3), %%mm1 \n\t" | |
288 "movq %%mm2, (%2) \n\t" | |
289 "movq %%mm1, (%2, %3) \n\t" | |
290 "addl %%eax, %2 \n\t" | |
441 | 291 "subl $4, %0 \n\t" |
442 | 292 "jnz 1b \n\t" |
293 :"+g"(h), "+D"(pixels), "+S"(block) | |
294 :"r" (line_size) | |
386 | 295 :"%eax", "memory"); |
0 | 296 } |