Mercurial > libavcodec.hg
annotate i386/dsputil_mmx_avg.h @ 439:6ae275655a23 libavcodec
* more PIC friendly and faster code
author | kabi |
---|---|
date | Mon, 27 May 2002 14:09:10 +0000 |
parents | 718a22dc121f |
children | c0de4d3c7d3c |
rev | line source |
---|---|
/*
 * DSP utils : average functions are compiled twice for 3dnow/mmx2
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002 Michael Niedermayer
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 */
387 | 23 |
/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
   clobber bug - now it will work with 2.95.2 and also with -fPIC
 */
0 | 27 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
28 { | |
386 | 29 __asm __volatile( |
439 | 30 "lea (%3, %3), %%eax \n\t" |
31 "1: \n\t" | |
32 "movq (%1), %%mm0 \n\t" | |
33 "movq (%1, %3), %%mm2 \n\t" | |
34 "movq 1(%1), %%mm1 \n\t" | |
35 "movq 1(%1, %3), %%mm3 \n\t" | |
36 "addl %%eax, %1 \n\t" | |
386 | 37 PAVGB" %%mm1, %%mm0 \n\t" |
38 PAVGB" %%mm3, %%mm2 \n\t" | |
439 | 39 "movq %%mm0, (%2) \n\t" |
40 "movq %%mm2, (%2, %3) \n\t" | |
41 "movq (%1), %%mm0 \n\t" | |
42 "movq 1(%1), %%mm1 \n\t" | |
43 "movq (%1, %3), %%mm2 \n\t" | |
44 "movq 1(%1, %3), %%mm3 \n\t" | |
45 "addl %%eax, %2 \n\t" | |
46 "addl %%eax, %1 \n\t" | |
386 | 47 PAVGB" %%mm1, %%mm0 \n\t" |
48 PAVGB" %%mm3, %%mm2 \n\t" | |
439 | 49 "movq %%mm0, (%2) \n\t" |
50 "movq %%mm2, (%2, %3) \n\t" | |
51 "addl %%eax, %2 \n\t" | |
386 | 52 "subl $4, %0 \n\t" |
53 " jnz 1b \n\t" | |
439 | 54 :"+g"(h), "+S"(pixels), "+D"(block) |
55 :"c" (line_size) | |
386 | 56 :"%eax", "memory"); |
57 } | |
58 | |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
59 /* GL: this function does incorrect rounding if overflow */ |
386 | 60 static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
61 { | |
0 | 62 __asm __volatile( |
439 | 63 "lea (%3, %3), %%eax \n\t" |
387 | 64 MOVQ_BONE(%%mm7) |
386 | 65 "1: \n\t" |
439 | 66 "movq (%1), %%mm0 \n\t" |
67 "movq (%1, %3), %%mm2 \n\t" | |
68 "movq 1(%1), %%mm1 \n\t" | |
69 "movq 1(%1, %3), %%mm3 \n\t" | |
70 "addl %%eax, %1 \n\t" | |
386 | 71 "psubusb %%mm7, %%mm0 \n\t" |
72 "psubusb %%mm7, %%mm2 \n\t" | |
73 PAVGB" %%mm1, %%mm0 \n\t" | |
74 PAVGB" %%mm3, %%mm2 \n\t" | |
439 | 75 "movq %%mm0, (%2) \n\t" |
76 "movq %%mm2, (%2, %3) \n\t" | |
77 "movq (%1), %%mm0 \n\t" | |
78 "movq 1(%1), %%mm1 \n\t" | |
79 "movq (%1, %3), %%mm2 \n\t" | |
80 "movq 1(%1, %3), %%mm3 \n\t" | |
81 "addl %%eax, %2 \n\t" | |
82 "addl %%eax, %1 \n\t" | |
386 | 83 "psubusb %%mm7, %%mm0 \n\t" |
84 "psubusb %%mm7, %%mm2 \n\t" | |
85 PAVGB" %%mm1, %%mm0 \n\t" | |
86 PAVGB" %%mm3, %%mm2 \n\t" | |
439 | 87 "movq %%mm0, (%2) \n\t" |
88 "movq %%mm2, (%2, %3) \n\t" | |
89 "addl %%eax, %2 \n\t" | |
386 | 90 "subl $4, %0 \n\t" |
439 | 91 "jnz 1b \n\t" |
92 :"+g"(h), "+S"(pixels), "+D"(block) | |
93 :"c" (line_size) | |
386 | 94 :"%eax", "memory"); |
0 | 95 } |
96 | |
97 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
98 { | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
99 __asm __volatile( |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
100 "lea (%3, %3), %%eax \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
101 "movq (%1), %%mm0 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
102 "subl %3, %2 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
103 "1: \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
104 "movq (%1, %3), %%mm1 \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
105 "movq (%1, %%eax), %%mm2 \n\t" |
439 | 106 "addl %%eax, %1 \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
107 PAVGB" %%mm1, %%mm0 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
108 PAVGB" %%mm2, %%mm1 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
109 "movq %%mm0, (%2, %3) \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
110 "movq %%mm1, (%2, %%eax) \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
111 "movq (%1, %3), %%mm1 \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
112 "movq (%1, %%eax), %%mm0 \n\t" |
439 | 113 "addl %%eax, %2 \n\t" |
114 "addl %%eax, %1 \n\t" | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
115 PAVGB" %%mm1, %%mm2 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
116 PAVGB" %%mm0, %%mm1 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
117 "movq %%mm2, (%2, %3) \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
118 "movq %%mm1, (%2, %%eax) \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
119 "addl %%eax, %2 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
120 "subl $4, %0 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
121 "jnz 1b \n\t" |
439 | 122 :"+g"(h), "+S"(pixels), "+D" (block) |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
123 :"c"(line_size) |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
124 :"%eax", "memory"); |
386 | 125 } |
126 | |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
127 /* GL: this function does incorrect rounding if overflow */ |
386 | 128 static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
129 { | |
0 | 130 __asm __volatile( |
387 | 131 MOVQ_BONE(%%mm7) |
439 | 132 "lea (%3, %3), %%eax \n\t" |
133 "movq (%1), %%mm0 \n\t" | |
134 "subl %3, %2 \n\t" | |
386 | 135 "1: \n\t" |
439 | 136 "movq (%1, %3), %%mm1 \n\t" |
137 "movq (%1, %%eax), %%mm2 \n\t" | |
138 "addl %%eax, %1 \n\t" | |
386 | 139 "psubusb %%mm7, %%mm1 \n\t" |
140 PAVGB" %%mm1, %%mm0 \n\t" | |
141 PAVGB" %%mm2, %%mm1 \n\t" | |
439 | 142 "movq %%mm0, (%2, %3) \n\t" |
143 "movq %%mm1, (%2, %%eax) \n\t" | |
144 "movq (%1, %3), %%mm1 \n\t" | |
145 "movq (%1, %%eax), %%mm0 \n\t" | |
146 "addl %%eax, %2 \n\t" | |
147 "addl %%eax, %1 \n\t" | |
386 | 148 "psubusb %%mm7, %%mm1 \n\t" |
149 PAVGB" %%mm1, %%mm2 \n\t" | |
150 PAVGB" %%mm0, %%mm1 \n\t" | |
439 | 151 "movq %%mm2, (%2, %3) \n\t" |
152 "movq %%mm1, (%2, %%eax) \n\t" | |
153 "addl %%eax, %2 \n\t" | |
386 | 154 "subl $4, %0 \n\t" |
439 | 155 "jnz 1b \n\t" |
156 :"+g"(h), "+S"(pixels), "+D" (block) | |
157 :"c"(line_size) | |
158 :"%eax", "memory"); | |
0 | 159 } |
160 | |
161 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
162 { | |
163 __asm __volatile( | |
386 | 164 "xorl %%eax, %%eax \n\t" |
165 ".balign 16 \n\t" | |
166 "1: \n\t" | |
167 "movq (%1, %%eax), %%mm0 \n\t" | |
168 "movq (%2, %%eax), %%mm2 \n\t" | |
169 "movq (%3, %%eax), %%mm3 \n\t" | |
170 "movq (%4, %%eax), %%mm4 \n\t" | |
171 PAVGB" %%mm3, %%mm0 \n\t" | |
172 PAVGB" %%mm4, %%mm2 \n\t" | |
173 "movq %%mm0, (%3, %%eax) \n\t" | |
174 "movq %%mm2, (%4, %%eax) \n\t" | |
175 "addl %5, %%eax \n\t" | |
176 "movq (%1, %%eax), %%mm0 \n\t" | |
177 "movq (%2, %%eax), %%mm2 \n\t" | |
178 "movq (%3, %%eax), %%mm3 \n\t" | |
179 "movq (%4, %%eax), %%mm4 \n\t" | |
180 PAVGB" %%mm3, %%mm0 \n\t" | |
181 PAVGB" %%mm4, %%mm2 \n\t" | |
182 "movq %%mm0, (%3, %%eax) \n\t" | |
183 "movq %%mm2, (%4, %%eax) \n\t" | |
184 "addl %5, %%eax \n\t" | |
185 "subl $4, %0 \n\t" | |
186 " jnz 1b \n\t" | |
187 :"+g"(h) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
188 :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size), |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
189 "g"(line_size<<1) |
386 | 190 :"%eax", "memory"); |
0 | 191 } |
192 | |
386 | 193 static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
0 | 194 { |
195 __asm __volatile( | |
386 | 196 "xorl %%eax, %%eax \n\t" |
197 ".balign 16 \n\t" | |
198 "1: \n\t" | |
199 "movq (%1, %%eax), %%mm0 \n\t" | |
200 "movq 1(%1, %%eax), %%mm1 \n\t" | |
201 "movq (%2, %%eax), %%mm2 \n\t" | |
202 "movq 1(%2, %%eax), %%mm3 \n\t" | |
203 PAVGB" %%mm1, %%mm0 \n\t" | |
204 PAVGB" %%mm3, %%mm2 \n\t" | |
205 "movq (%3, %%eax), %%mm3 \n\t" | |
206 "movq (%4, %%eax), %%mm4 \n\t" | |
207 PAVGB" %%mm3, %%mm0 \n\t" | |
208 PAVGB" %%mm4, %%mm2 \n\t" | |
209 "movq %%mm0, (%3, %%eax) \n\t" | |
210 "movq %%mm2, (%4, %%eax) \n\t" | |
211 "addl %5, %%eax \n\t" | |
212 "movq (%1, %%eax), %%mm0 \n\t" | |
213 "movq 1(%1, %%eax), %%mm1 \n\t" | |
214 "movq (%2, %%eax), %%mm2 \n\t" | |
215 "movq 1(%2, %%eax), %%mm3 \n\t" | |
216 PAVGB" %%mm1, %%mm0 \n\t" | |
217 PAVGB" %%mm3, %%mm2 \n\t" | |
218 "movq (%3, %%eax), %%mm3 \n\t" | |
219 "movq (%4, %%eax), %%mm4 \n\t" | |
220 PAVGB" %%mm3, %%mm0 \n\t" | |
221 PAVGB" %%mm4, %%mm2 \n\t" | |
222 "movq %%mm0, (%3, %%eax) \n\t" | |
223 "movq %%mm2, (%4, %%eax) \n\t" | |
224 "addl %5, %%eax \n\t" | |
225 "subl $4, %0 \n\t" | |
226 " jnz 1b \n\t" | |
227 :"+g"(h) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
228 :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size), |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
229 "g"(line_size<<1) |
386 | 230 :"%eax", "memory"); |
0 | 231 } |
232 | |
386 | 233 static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
0 | 234 { |
235 __asm __volatile( | |
386 | 236 "xorl %%eax, %%eax \n\t" |
237 "movq (%1), %%mm0 \n\t" | |
238 ".balign 16 \n\t" | |
239 "1: \n\t" | |
240 "movq (%2, %%eax), %%mm1 \n\t" | |
241 "movq (%3, %%eax), %%mm2 \n\t" | |
242 PAVGB" %%mm1, %%mm0 \n\t" | |
243 PAVGB" %%mm2, %%mm1 \n\t" | |
244 "movq (%4, %%eax), %%mm3 \n\t" | |
245 "movq (%5, %%eax), %%mm4 \n\t" | |
246 PAVGB" %%mm3, %%mm0 \n\t" | |
247 PAVGB" %%mm4, %%mm1 \n\t" | |
248 "movq %%mm0, (%4, %%eax) \n\t" | |
249 "movq %%mm1, (%5, %%eax) \n\t" | |
250 "addl %6, %%eax \n\t" | |
251 "movq (%2, %%eax), %%mm1 \n\t" | |
252 "movq (%3, %%eax), %%mm0 \n\t" | |
253 PAVGB" %%mm1, %%mm2 \n\t" | |
254 PAVGB" %%mm0, %%mm1 \n\t" | |
255 "movq (%4, %%eax), %%mm3 \n\t" | |
256 "movq (%5, %%eax), %%mm4 \n\t" | |
257 PAVGB" %%mm3, %%mm2 \n\t" | |
258 PAVGB" %%mm4, %%mm1 \n\t" | |
259 "movq %%mm2, (%4, %%eax) \n\t" | |
260 "movq %%mm1, (%5, %%eax) \n\t" | |
261 "addl %6, %%eax \n\t" | |
262 "subl $4, %0 \n\t" | |
263 " jnz 1b \n\t" | |
264 :"+g"(h) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
265 :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
266 "r" (block+line_size), "g"(line_size<<1) |
386 | 267 :"%eax", "memory"); |
0 | 268 } |
269 | |
386 | 270 // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter |
271 static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
0 | 272 { |
273 __asm __volatile( | |
387 | 274 MOVQ_BONE(%%mm7) |
386 | 275 "xorl %%eax, %%eax \n\t" |
276 "movq (%1), %%mm0 \n\t" | |
277 "movq 1(%1), %%mm1 \n\t" | |
278 PAVGB" %%mm1, %%mm0 \n\t" | |
279 ".balign 16 \n\t" | |
280 "1: \n\t" | |
281 "movq (%2, %%eax), %%mm1 \n\t" | |
282 "movq (%3, %%eax), %%mm2 \n\t" | |
283 "movq 1(%2, %%eax), %%mm3 \n\t" | |
284 "movq 1(%3, %%eax), %%mm4 \n\t" | |
285 "psubusb %%mm7, %%mm2 \n\t" | |
286 PAVGB" %%mm3, %%mm1 \n\t" | |
287 PAVGB" %%mm4, %%mm2 \n\t" | |
288 PAVGB" %%mm1, %%mm0 \n\t" | |
289 PAVGB" %%mm2, %%mm1 \n\t" | |
290 "movq (%4, %%eax), %%mm3 \n\t" | |
291 "movq (%5, %%eax), %%mm4 \n\t" | |
292 PAVGB" %%mm3, %%mm0 \n\t" | |
293 PAVGB" %%mm4, %%mm1 \n\t" | |
294 "movq %%mm0, (%4, %%eax) \n\t" | |
295 "movq %%mm1, (%5, %%eax) \n\t" | |
296 "addl %6, %%eax \n\t" | |
297 "movq (%2, %%eax), %%mm1 \n\t" | |
298 "movq (%3, %%eax), %%mm0 \n\t" | |
299 "movq 1(%2, %%eax), %%mm3 \n\t" | |
300 "movq 1(%3, %%eax), %%mm4 \n\t" | |
301 PAVGB" %%mm3, %%mm1 \n\t" | |
302 PAVGB" %%mm4, %%mm0 \n\t" | |
303 PAVGB" %%mm1, %%mm2 \n\t" | |
304 PAVGB" %%mm0, %%mm1 \n\t" | |
305 "movq (%4, %%eax), %%mm3 \n\t" | |
306 "movq (%5, %%eax), %%mm4 \n\t" | |
307 PAVGB" %%mm3, %%mm2 \n\t" | |
308 PAVGB" %%mm4, %%mm1 \n\t" | |
309 "movq %%mm2, (%4, %%eax) \n\t" | |
310 "movq %%mm1, (%5, %%eax) \n\t" | |
311 "addl %6, %%eax \n\t" | |
312 "subl $4, %0 \n\t" | |
313 " jnz 1b \n\t" | |
314 :"+g"(h) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
315 :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
316 "r" (block+line_size), "g"(line_size<<1) |
386 | 317 :"%eax", "memory"); |
0 | 318 } |