Mercurial > libavcodec.hg
annotate i386/dsputil_mmx_avg.h @ 387:b8f3affeb8e1 libavcodec
shared lib support (req by kabi) ...
author | michaelni |
---|---|
date | Fri, 17 May 2002 13:01:01 +0000 |
parents | f49629bab18d |
children | f874493a1970 |
rev | line source |
---|---|
0 | 1 /* |
2 * DSP utils : average functions are compiled twice for 3dnow/mmx2 | |
3 * Copyright (c) 2000, 2001 Gerard Lantau. | |
386 | 4 * Copyright (c) 2002 Michael Niedermayer |
0 | 5 * |
6 * This program is free software; you can redistribute it and/or modify | |
7 * it under the terms of the GNU General Public License as published by | |
8 * the Free Software Foundation; either version 2 of the License, or | |
9 * (at your option) any later version. | |
10 * | |
11 * This program is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 * GNU General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU General Public License | |
17 * along with this program; if not, write to the Free Software | |
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
19 * | |
20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
386 | 21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> |
0 | 22 */ |
387 | 23 |
/*
 * put_pixels_x2: for every row, stores into block the PAVGB result of the
 * 8 bytes at pixels and the 8 bytes at pixels+1 (horizontal neighbours).
 *
 * block     - destination, written 8 bytes per row
 * pixels    - source, read 9 bytes per row (offsets 0..8)
 * line_size - byte distance between consecutive rows (used for both buffers)
 * h         - row count; one pass of the loop handles 4 rows and subtracts 4,
 *             and the loop exits only when h reaches exactly 0
 *
 * PAVGB is a macro defined outside this view (presumably the mmx2/3dnow byte
 * average instruction, per the file header -- confirm).
 * NOTE(review): mm0-mm3 are modified but only %eax and memory are listed as
 * clobbers, and no emms is issued in this function.
 */
0 | 24 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
25 { | |
386 | 26 __asm __volatile( |
/* eax = running byte offset, advanced by 2*line_size (%5) after each row pair */
27 "xorl %%eax, %%eax \n\t" | |
28 ".balign 16 \n\t" | |
29 "1: \n\t" | |
/* rows 0 and 1 of this pass: %1 = pixels, %2 = pixels+line_size */
30 "movq (%1, %%eax), %%mm0 \n\t" | |
31 "movq 1(%1, %%eax), %%mm1 \n\t" | |
32 "movq (%2, %%eax), %%mm2 \n\t" | |
33 "movq 1(%2, %%eax), %%mm3 \n\t" | |
34 PAVGB" %%mm1, %%mm0 \n\t" | |
35 PAVGB" %%mm3, %%mm2 \n\t" | |
36 "movq %%mm0, (%3, %%eax) \n\t" | |
37 "movq %%mm2, (%4, %%eax) \n\t" | |
38 "addl %5, %%eax \n\t" | |
/* rows 2 and 3 of this pass: same sequence at the advanced offset */
39 "movq (%1, %%eax), %%mm0 \n\t" | |
40 "movq 1(%1, %%eax), %%mm1 \n\t" | |
41 "movq (%2, %%eax), %%mm2 \n\t" | |
42 "movq 1(%2, %%eax), %%mm3 \n\t" | |
43 PAVGB" %%mm1, %%mm0 \n\t" | |
44 PAVGB" %%mm3, %%mm2 \n\t" | |
45 "movq %%mm0, (%3, %%eax) \n\t" | |
46 "movq %%mm2, (%4, %%eax) \n\t" | |
47 "addl %5, %%eax \n\t" | |
48 "subl $4, %0 \n\t" | |
49 " jnz 1b \n\t" | |
50 :"+g"(h) | |
51 :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size), | |
52 "r"(line_size<<1) | |
53 :"%eax", "memory"); | |
54 } | |
55 | |
/*
 * put_no_rnd_pixels_x2: same row/offset structure as a horizontal 2-tap
 * average (bytes at pixels and pixels+1, 8 bytes per row, 4 rows per loop
 * pass), but mm7 is subtracted with unsigned saturation (psubusb) from the
 * first operand before each PAVGB.
 *
 * block     - destination, written 8 bytes per row
 * pixels    - source, read 9 bytes per row (offsets 0..8)
 * line_size - byte distance between consecutive rows
 * h         - row count; decremented by 4 per pass, loop exits only at 0
 *
 * MOVQ_BONE and PAVGB are macros defined outside this view. MOVQ_BONE
 * presumably loads 0x01 into every byte of mm7 so that the subtract biases
 * the average downward -- TODO confirm against its definition.
 * NOTE(review): mm0-mm3 and mm7 are modified but not listed as clobbers, and
 * no emms is issued in this function.
 */
56 static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
57 { | |
0 | 58 __asm __volatile( |
386 | 59 "xorl %%eax, %%eax \n\t" |
387 | 60 MOVQ_BONE(%%mm7) |
386 | 61 ".balign 16 \n\t" |
62 "1: \n\t" | |
/* rows 0 and 1 of this pass */
63 "movq (%1, %%eax), %%mm0 \n\t" | |
64 "movq 1(%1, %%eax), %%mm1 \n\t" | |
65 "movq (%2, %%eax), %%mm2 \n\t" | |
66 "movq 1(%2, %%eax), %%mm3 \n\t" | |
/* only the left-hand operand of each pair is reduced by mm7 */
67 "psubusb %%mm7, %%mm0 \n\t" | |
68 "psubusb %%mm7, %%mm2 \n\t" | |
69 PAVGB" %%mm1, %%mm0 \n\t" | |
70 PAVGB" %%mm3, %%mm2 \n\t" | |
71 "movq %%mm0, (%3, %%eax) \n\t" | |
72 "movq %%mm2, (%4, %%eax) \n\t" | |
73 "addl %5, %%eax \n\t" | |
/* rows 2 and 3 of this pass */
74 "movq (%1, %%eax), %%mm0 \n\t" | |
75 "movq 1(%1, %%eax), %%mm1 \n\t" | |
76 "movq (%2, %%eax), %%mm2 \n\t" | |
77 "movq 1(%2, %%eax), %%mm3 \n\t" | |
78 "psubusb %%mm7, %%mm0 \n\t" | |
79 "psubusb %%mm7, %%mm2 \n\t" | |
80 PAVGB" %%mm1, %%mm0 \n\t" | |
81 PAVGB" %%mm3, %%mm2 \n\t" | |
82 "movq %%mm0, (%3, %%eax) \n\t" | |
83 "movq %%mm2, (%4, %%eax) \n\t" | |
84 "addl %5, %%eax \n\t" | |
85 "subl $4, %0 \n\t" | |
86 " jnz 1b \n\t" | |
87 :"+g"(h) | |
88 :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size), | |
89 "r"(line_size<<1) | |
90 :"%eax", "memory"); | |
0 | 91 } |
92 | |
/*
 * put_pixels_y2: for every row, stores into block the PAVGB result of the
 * 8 bytes of that source row and the 8 bytes of the row below it (vertical
 * neighbours).
 *
 * block     - destination, written 8 bytes per row
 * pixels    - source; rows 0..h are read (one row more than is written)
 * line_size - byte distance between consecutive rows
 * h         - row count; decremented by 4 per pass, loop exits only at 0
 *
 * The last source row loaded in one step is kept in a register and reused as
 * the upper row of the next step, so each source row is loaded once. The
 * registers mm0 and mm2 swap roles between the two halves of the loop body.
 * PAVGB is a macro defined outside this view.
 * NOTE(review): mm0-mm2 are modified but not listed as clobbers, and no emms
 * is issued in this function.
 */
93 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
94 { | |
95 __asm __volatile( | |
386 | 96 "xorl %%eax, %%eax \n\t" |
/* prime mm0 with source row 0 before entering the loop */
97 "movq (%1), %%mm0 \n\t" | |
98 ".balign 16 \n\t" | |
99 "1: \n\t" | |
/* %2 = pixels+line_size, %3 = pixels+2*line_size: the next two source rows */
100 "movq (%2, %%eax), %%mm1 \n\t" | |
101 "movq (%3, %%eax), %%mm2 \n\t" | |
102 PAVGB" %%mm1, %%mm0 \n\t" | |
103 PAVGB" %%mm2, %%mm1 \n\t" | |
104 "movq %%mm0, (%4, %%eax) \n\t" | |
105 "movq %%mm1, (%5, %%eax) \n\t" | |
106 "addl %6, %%eax \n\t" | |
/* second half: mm2 holds the carried row, mm0 receives the newest row */
107 "movq (%2, %%eax), %%mm1 \n\t" | |
108 "movq (%3, %%eax), %%mm0 \n\t" | |
109 PAVGB" %%mm1, %%mm2 \n\t" | |
110 PAVGB" %%mm0, %%mm1 \n\t" | |
111 "movq %%mm2, (%4, %%eax) \n\t" | |
112 "movq %%mm1, (%5, %%eax) \n\t" | |
113 "addl %6, %%eax \n\t" | |
114 "subl $4, %0 \n\t" | |
115 " jnz 1b \n\t" | |
116 :"+g"(h) | |
117 :"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), | |
118 "r" (block+line_size), "g"(line_size<<1) | |
119 :"%eax", "memory"); | |
120 } | |
121 | |
/*
 * put_no_rnd_pixels_y2: vertical 2-tap PAVGB of each source row with the row
 * below it, 8 bytes per row, 4 rows per loop pass. Before averaging, mm7 is
 * subtracted with unsigned saturation (psubusb) from the first row loaded in
 * each half of the loop body (mm1), which then takes part in both averages of
 * that half.
 *
 * block     - destination, written 8 bytes per row
 * pixels    - source; rows 0..h are read
 * line_size - byte distance between consecutive rows
 * h         - row count; decremented by 4 per pass, loop exits only at 0
 *
 * MOVQ_BONE and PAVGB are macros defined outside this view. MOVQ_BONE
 * presumably loads 0x01 into every byte of mm7 -- TODO confirm.
 * NOTE(review): the subtract is applied to every other source row only, so
 * the downward bias is not symmetric across rows; whether that is an accepted
 * approximation is not stated here.
 * NOTE(review): MMX registers are modified but not listed as clobbers, and no
 * emms is issued in this function.
 */
122 static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
123 { | |
0 | 124 __asm __volatile( |
387 | 125 MOVQ_BONE(%%mm7) |
386 | 126 "xorl %%eax, %%eax \n\t" |
/* prime mm0 with source row 0 before entering the loop */
127 "movq (%1), %%mm0 \n\t" | |
128 ".balign 16 \n\t" | |
129 "1: \n\t" | |
130 "movq (%2, %%eax), %%mm1 \n\t" | |
131 "movq (%3, %%eax), %%mm2 \n\t" | |
132 "psubusb %%mm7, %%mm1 \n\t" | |
133 PAVGB" %%mm1, %%mm0 \n\t" | |
134 PAVGB" %%mm2, %%mm1 \n\t" | |
135 "movq %%mm0, (%4, %%eax) \n\t" | |
136 "movq %%mm1, (%5, %%eax) \n\t" | |
137 "addl %6, %%eax \n\t" | |
/* second half: mm2 holds the carried row, mm0 receives the newest row */
138 "movq (%2, %%eax), %%mm1 \n\t" | |
139 "movq (%3, %%eax), %%mm0 \n\t" | |
140 "psubusb %%mm7, %%mm1 \n\t" | |
141 PAVGB" %%mm1, %%mm2 \n\t" | |
142 PAVGB" %%mm0, %%mm1 \n\t" | |
143 "movq %%mm2, (%4, %%eax) \n\t" | |
144 "movq %%mm1, (%5, %%eax) \n\t" | |
145 "addl %6, %%eax \n\t" | |
146 "subl $4, %0 \n\t" | |
147 " jnz 1b \n\t" | |
148 :"+g"(h) | |
149 :"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), | |
150 "r" (block+line_size), "g"(line_size<<1) | |
151 :"%eax", "memory"); | |
0 | 152 } |
153 | |
/*
 * avg_pixels: for every row, replaces the 8 bytes in block with the PAVGB
 * result of those bytes and the corresponding 8 bytes of pixels.
 *
 * block     - read-modify-write destination, 8 bytes per row
 * pixels    - source, 8 bytes per row
 * line_size - byte distance between consecutive rows (used for both buffers)
 * h         - row count; decremented by 4 per pass, loop exits only at 0
 *
 * PAVGB is a macro defined outside this view.
 * NOTE(review): mm0, mm2, mm3 and mm4 are modified but not listed as
 * clobbers, and no emms is issued in this function.
 */
154 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
155 { | |
156 __asm __volatile( | |
386 | 157 "xorl %%eax, %%eax \n\t" |
158 ".balign 16 \n\t" | |
159 "1: \n\t" | |
/* mm0/mm2 = two source rows, mm3/mm4 = the matching rows already in block */
160 "movq (%1, %%eax), %%mm0 \n\t" | |
161 "movq (%2, %%eax), %%mm2 \n\t" | |
162 "movq (%3, %%eax), %%mm3 \n\t" | |
163 "movq (%4, %%eax), %%mm4 \n\t" | |
164 PAVGB" %%mm3, %%mm0 \n\t" | |
165 PAVGB" %%mm4, %%mm2 \n\t" | |
166 "movq %%mm0, (%3, %%eax) \n\t" | |
167 "movq %%mm2, (%4, %%eax) \n\t" | |
168 "addl %5, %%eax \n\t" | |
/* rows 2 and 3 of this pass */
169 "movq (%1, %%eax), %%mm0 \n\t" | |
170 "movq (%2, %%eax), %%mm2 \n\t" | |
171 "movq (%3, %%eax), %%mm3 \n\t" | |
172 "movq (%4, %%eax), %%mm4 \n\t" | |
173 PAVGB" %%mm3, %%mm0 \n\t" | |
174 PAVGB" %%mm4, %%mm2 \n\t" | |
175 "movq %%mm0, (%3, %%eax) \n\t" | |
176 "movq %%mm2, (%4, %%eax) \n\t" | |
177 "addl %5, %%eax \n\t" | |
178 "subl $4, %0 \n\t" | |
179 " jnz 1b \n\t" | |
180 :"+g"(h) | |
181 :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size), | |
182 "r"(line_size<<1) | |
183 :"%eax", "memory"); | |
0 | 184 } |
185 | |
/*
 * avg_pixels_x2: for every row, first takes the PAVGB of the 8 bytes at
 * pixels and the 8 bytes at pixels+1 (horizontal neighbours), then takes the
 * PAVGB of that result with the 8 bytes already in block and stores it back.
 *
 * block     - read-modify-write destination, 8 bytes per row
 * pixels    - source, read 9 bytes per row (offsets 0..8)
 * line_size - byte distance between consecutive rows
 * h         - row count; decremented by 4 per pass, loop exits only at 0
 *
 * PAVGB is a macro defined outside this view.
 * NOTE(review): two chained PAVGB steps are used, so any per-step rounding of
 * PAVGB is applied twice; the exact rounding is not visible here.
 * NOTE(review): mm0-mm4 are modified but not listed as clobbers, and no emms
 * is issued in this function.
 */
386 | 186 static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
0 | 187 { |
188 __asm __volatile( | |
386 | 189 "xorl %%eax, %%eax \n\t" |
190 ".balign 16 \n\t" | |
191 "1: \n\t" | |
/* horizontal average of two source rows */
192 "movq (%1, %%eax), %%mm0 \n\t" | |
193 "movq 1(%1, %%eax), %%mm1 \n\t" | |
194 "movq (%2, %%eax), %%mm2 \n\t" | |
195 "movq 1(%2, %%eax), %%mm3 \n\t" | |
196 PAVGB" %%mm1, %%mm0 \n\t" | |
197 PAVGB" %%mm3, %%mm2 \n\t" | |
/* mm3 is reused here for the existing block row; then average into it */
198 "movq (%3, %%eax), %%mm3 \n\t" | |
199 "movq (%4, %%eax), %%mm4 \n\t" | |
200 PAVGB" %%mm3, %%mm0 \n\t" | |
201 PAVGB" %%mm4, %%mm2 \n\t" | |
202 "movq %%mm0, (%3, %%eax) \n\t" | |
203 "movq %%mm2, (%4, %%eax) \n\t" | |
204 "addl %5, %%eax \n\t" | |
/* rows 2 and 3 of this pass */
205 "movq (%1, %%eax), %%mm0 \n\t" | |
206 "movq 1(%1, %%eax), %%mm1 \n\t" | |
207 "movq (%2, %%eax), %%mm2 \n\t" | |
208 "movq 1(%2, %%eax), %%mm3 \n\t" | |
209 PAVGB" %%mm1, %%mm0 \n\t" | |
210 PAVGB" %%mm3, %%mm2 \n\t" | |
211 "movq (%3, %%eax), %%mm3 \n\t" | |
212 "movq (%4, %%eax), %%mm4 \n\t" | |
213 PAVGB" %%mm3, %%mm0 \n\t" | |
214 PAVGB" %%mm4, %%mm2 \n\t" | |
215 "movq %%mm0, (%3, %%eax) \n\t" | |
216 "movq %%mm2, (%4, %%eax) \n\t" | |
217 "addl %5, %%eax \n\t" | |
218 "subl $4, %0 \n\t" | |
219 " jnz 1b \n\t" | |
220 :"+g"(h) | |
221 :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size), | |
222 "r"(line_size<<1) | |
223 :"%eax", "memory"); | |
0 | 224 } |
225 | |
/*
 * avg_pixels_y2: for every row, first takes the PAVGB of the 8 bytes of that
 * source row and the 8 bytes of the row below it (vertical neighbours), then
 * takes the PAVGB of that result with the 8 bytes already in block and stores
 * it back.
 *
 * block     - read-modify-write destination, 8 bytes per row
 * pixels    - source; rows 0..h are read (one row more than is written)
 * line_size - byte distance between consecutive rows
 * h         - row count; decremented by 4 per pass, loop exits only at 0
 *
 * The most recently loaded source row is carried in a register into the next
 * step (mm0 and mm2 swap roles between the two halves of the loop body).
 * PAVGB is a macro defined outside this view.
 * NOTE(review): mm0-mm4 are modified but not listed as clobbers, and no emms
 * is issued in this function.
 */
386 | 226 static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
0 | 227 { |
228 __asm __volatile( | |
386 | 229 "xorl %%eax, %%eax \n\t" |
/* prime mm0 with source row 0 before entering the loop */
230 "movq (%1), %%mm0 \n\t" | |
231 ".balign 16 \n\t" | |
232 "1: \n\t" | |
233 "movq (%2, %%eax), %%mm1 \n\t" | |
234 "movq (%3, %%eax), %%mm2 \n\t" | |
235 PAVGB" %%mm1, %%mm0 \n\t" | |
236 PAVGB" %%mm2, %%mm1 \n\t" | |
/* blend the vertical averages with the rows already in block (%4, %5) */
237 "movq (%4, %%eax), %%mm3 \n\t" | |
238 "movq (%5, %%eax), %%mm4 \n\t" | |
239 PAVGB" %%mm3, %%mm0 \n\t" | |
240 PAVGB" %%mm4, %%mm1 \n\t" | |
241 "movq %%mm0, (%4, %%eax) \n\t" | |
242 "movq %%mm1, (%5, %%eax) \n\t" | |
243 "addl %6, %%eax \n\t" | |
/* second half: mm2 holds the carried row, mm0 receives the newest row */
244 "movq (%2, %%eax), %%mm1 \n\t" | |
245 "movq (%3, %%eax), %%mm0 \n\t" | |
246 PAVGB" %%mm1, %%mm2 \n\t" | |
247 PAVGB" %%mm0, %%mm1 \n\t" | |
248 "movq (%4, %%eax), %%mm3 \n\t" | |
249 "movq (%5, %%eax), %%mm4 \n\t" | |
250 PAVGB" %%mm3, %%mm2 \n\t" | |
251 PAVGB" %%mm4, %%mm1 \n\t" | |
252 "movq %%mm2, (%4, %%eax) \n\t" | |
253 "movq %%mm1, (%5, %%eax) \n\t" | |
254 "addl %6, %%eax \n\t" | |
255 "subl $4, %0 \n\t" | |
256 " jnz 1b \n\t" | |
257 :"+g"(h) | |
258 :"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), | |
259 "r" (block+line_size), "g"(line_size<<1) | |
260 :"%eax", "memory"); | |
0 | 261 } |
262 | |
386 | 263 // Note: this is not correctly rounded, but this function is only used for B-frames, so it doesn't matter |
/*
 * avg_pixels_xy2: approximates a 2x2 source average by chaining PAVGB steps:
 * each source row is first averaged horizontally (bytes at x and x+1), the
 * horizontal results of two adjacent rows are then averaged vertically, and
 * that value is finally averaged with the 8 bytes already in block and
 * stored back.
 *
 * block     - read-modify-write destination, 8 bytes per row
 * pixels    - source; rows 0..h are read, 9 bytes per row (offsets 0..8)
 * line_size - byte distance between consecutive rows
 * h         - row count; decremented by 4 per pass, loop exits only at 0
 *
 * The horizontal average of the last source row is carried in a register
 * into the next step (mm0 and mm2 swap roles between the two halves).
 * MOVQ_BONE and PAVGB are macros defined outside this view. MOVQ_BONE
 * presumably loads 0x01 into every byte of mm7 -- TODO confirm.
 * NOTE(review): psubusb with mm7 is applied only once per loop pass (to mm2
 * in the first half); the second half has no matching subtract.
 * NOTE(review): mm0-mm4 and mm7 are modified but not listed as clobbers, and
 * no emms is issued in this function.
 */
264 static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
0 | 265 { |
266 __asm __volatile( | |
387 | 267 MOVQ_BONE(%%mm7) |
386 | 268 "xorl %%eax, %%eax \n\t" |
/* prime mm0 with the horizontal average of source row 0 */
269 "movq (%1), %%mm0 \n\t" | |
270 "movq 1(%1), %%mm1 \n\t" | |
271 PAVGB" %%mm1, %%mm0 \n\t" | |
272 ".balign 16 \n\t" | |
273 "1: \n\t" | |
274 "movq (%2, %%eax), %%mm1 \n\t" | |
275 "movq (%3, %%eax), %%mm2 \n\t" | |
276 "movq 1(%2, %%eax), %%mm3 \n\t" | |
277 "movq 1(%3, %%eax), %%mm4 \n\t" | |
278 "psubusb %%mm7, %%mm2 \n\t" | |
/* horizontal averages of the two new rows, then vertical averages */
279 PAVGB" %%mm3, %%mm1 \n\t" | |
280 PAVGB" %%mm4, %%mm2 \n\t" | |
281 PAVGB" %%mm1, %%mm0 \n\t" | |
282 PAVGB" %%mm2, %%mm1 \n\t" | |
/* blend with the rows already in block (%4, %5) */
283 "movq (%4, %%eax), %%mm3 \n\t" | |
284 "movq (%5, %%eax), %%mm4 \n\t" | |
285 PAVGB" %%mm3, %%mm0 \n\t" | |
286 PAVGB" %%mm4, %%mm1 \n\t" | |
287 "movq %%mm0, (%4, %%eax) \n\t" | |
288 "movq %%mm1, (%5, %%eax) \n\t" | |
289 "addl %6, %%eax \n\t" | |
/* second half: mm2 holds the carried horizontal average, mm0 the newest row */
290 "movq (%2, %%eax), %%mm1 \n\t" | |
291 "movq (%3, %%eax), %%mm0 \n\t" | |
292 "movq 1(%2, %%eax), %%mm3 \n\t" | |
293 "movq 1(%3, %%eax), %%mm4 \n\t" | |
294 PAVGB" %%mm3, %%mm1 \n\t" | |
295 PAVGB" %%mm4, %%mm0 \n\t" | |
296 PAVGB" %%mm1, %%mm2 \n\t" | |
297 PAVGB" %%mm0, %%mm1 \n\t" | |
298 "movq (%4, %%eax), %%mm3 \n\t" | |
299 "movq (%5, %%eax), %%mm4 \n\t" | |
300 PAVGB" %%mm3, %%mm2 \n\t" | |
301 PAVGB" %%mm4, %%mm1 \n\t" | |
302 "movq %%mm2, (%4, %%eax) \n\t" | |
303 "movq %%mm1, (%5, %%eax) \n\t" | |
304 "addl %6, %%eax \n\t" | |
305 "subl $4, %0 \n\t" | |
306 " jnz 1b \n\t" | |
307 :"+g"(h) | |
308 :"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), | |
309 "r" (block+line_size), "g"(line_size<<1) | |
310 :"%eax", "memory"); | |
0 | 311 } |
312 | |
386 | 313 // Note: the sub* functions are not used |
314 | |
/*
 * sub_pixels_x2: for every row, computes the PAVGB of the 8 bytes at pix and
 * the 8 bytes at pix+1, zero-extends the result to eight 16-bit words, and
 * subtracts those (psubsw, signed saturating) from the 16 bytes stored at p,
 * writing the difference back to p.
 *
 * block     - read-modify-write coefficient buffer; advanced by 8 DCTELEMs
 *             per row (matches the 16 bytes touched per row if DCTELEM is
 *             16 bits wide -- TODO confirm, the typedef is not visible here)
 * pixels    - source, read 9 bytes per row (offsets 0..8)
 * line_size - byte distance between consecutive source rows
 * h         - row count; the do/while body runs at least once and h <= 0 is
 *             not guarded
 *
 * PAVGB is a macro defined outside this view.
 * NOTE(review): "1%1" and "8%0" prepend a displacement to the text of a
 * memory operand; this only assembles when the compiler prints the operand
 * in a form that accepts a leading displacement -- verify with the target
 * compiler.
 * NOTE(review): mm7 is zeroed in a separate asm statement and is neither an
 * operand nor a clobber of either statement; no emms is issued here.
 */
0 | 315 static void DEF(sub_pixels_x2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h) |
316 { | |
317 DCTELEM *p; | |
318 const UINT8 *pix; | |
319 p = block; | |
320 pix = pixels; | |
321 __asm __volatile( | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
322 "pxor %%mm7, %%mm7":); |
0 | 323 do { |
324 __asm __volatile( | |
325 "movq 1%1, %%mm2\n\t" | |
326 "movq %0, %%mm0\n\t" | |
327 PAVGB" %1, %%mm2\n\t" | |
328 "movq 8%0, %%mm1\n\t" | |
329 "movq %%mm2, %%mm3\n\t" | |
/* widen the 8 averaged bytes to words using the zero in mm7 */
330 "punpcklbw %%mm7, %%mm2\n\t" | |
331 "punpckhbw %%mm7, %%mm3\n\t" | |
332 "psubsw %%mm2, %%mm0\n\t" | |
333 "psubsw %%mm3, %%mm1\n\t" | |
334 "movq %%mm0, %0\n\t" | |
335 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
336 :"+m"(*p) |
0 | 337 :"m"(*pix) |
338 :"memory"); | |
339 pix += line_size; | |
340 p += 8; | |
341 } while (--h); | |
342 } | |
343 | |
/*
 * sub_pixels_y2: for every row, computes the PAVGB of the 8 bytes at pix and
 * the 8 bytes at pix+line_size (the row below), zero-extends the result to
 * eight 16-bit words, and subtracts those (psubsw, signed saturating) from
 * the 16 bytes stored at p, writing the difference back to p.
 *
 * block     - read-modify-write coefficient buffer; advanced by 8 DCTELEMs
 *             per row (matches the 16 bytes touched per row if DCTELEM is
 *             16 bits wide -- TODO confirm, the typedef is not visible here)
 * pixels    - source; rows 0..h are read, 8 bytes per row
 * line_size - byte distance between consecutive source rows
 * h         - row count; the do/while body runs at least once and h <= 0 is
 *             not guarded
 *
 * PAVGB is a macro defined outside this view.
 * NOTE(review): "8%0" prepends a displacement to the text of a memory
 * operand; this only assembles when the compiler prints the operand in a
 * form that accepts a leading displacement -- verify with the target
 * compiler.
 * NOTE(review): mm7 is zeroed in a separate asm statement and is neither an
 * operand nor a clobber of either statement; no emms is issued here.
 */
344 static void DEF(sub_pixels_y2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
345 { | |
346 DCTELEM *p; | |
347 const UINT8 *pix; | |
348 p = block; | |
349 pix = pixels; | |
350 __asm __volatile( | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
351 "pxor %%mm7, %%mm7":); |
0 | 352 do { |
353 __asm __volatile( | |
/* %1 = current source row, %2 = the row below it */
354 "movq %2, %%mm2\n\t" | |
355 "movq %0, %%mm0\n\t" | |
356 PAVGB" %1, %%mm2\n\t" | |
357 "movq 8%0, %%mm1\n\t" | |
358 "movq %%mm2, %%mm3\n\t" | |
/* widen the 8 averaged bytes to words using the zero in mm7 */
359 "punpcklbw %%mm7, %%mm2\n\t" | |
360 "punpckhbw %%mm7, %%mm3\n\t" | |
361 "psubsw %%mm2, %%mm0\n\t" | |
362 "psubsw %%mm3, %%mm1\n\t" | |
363 "movq %%mm0, %0\n\t" | |
364 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
365 :"+m"(*p) |
0 | 366 :"m"(*pix), "m"(*(pix+line_size)) |
367 :"memory"); | |
368 pix += line_size; | |
369 p += 8; | |
370 } while (--h); | |
371 } | |
372 |