Mercurial > libavcodec.hg
annotate i386/dsputil_mmx_avg.h @ 157:bc12fd7e6153 libavcodec
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
author | michael |
---|---|
date | Wed, 14 Nov 2001 02:46:58 +0000 |
parents | ae0516eadae2 |
children | f49629bab18d |
rev | line source |
---|---|
0 | 1 /* |
2 * DSP utils : average functions are compiled twice for 3dnow/mmx2 | |
3 * Copyright (c) 2000, 2001 Gerard Lantau. | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 * | |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
23 { | |
24 int dh, hh; | |
25 UINT8 *p; | |
26 const UINT8 *pix; | |
27 p = block; | |
28 pix = pixels; | |
29 hh=h>>2; | |
30 dh=h&3; | |
31 while(hh--) { | |
32 __asm __volatile( | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
33 "movq (%1), %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
34 "movq 1(%1), %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
35 "movq (%1, %2), %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
36 "movq 1(%1, %2), %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
37 "movq (%1, %2, 2), %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
38 "movq 1(%1, %2, 2), %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
39 "movq (%1, %3), %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
40 "movq 1(%1, %3), %%mm7\n\t" |
0 | 41 PAVGB" %%mm1, %%mm0\n\t" |
42 PAVGB" %%mm3, %%mm2\n\t" | |
43 PAVGB" %%mm5, %%mm4\n\t" | |
44 PAVGB" %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
45 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
46 "movq %%mm2, (%0, %2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
47 "movq %%mm4, (%0, %2, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
48 "movq %%mm6, (%0, %3)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
49 ::"r"(p), "r"(pix), "r" (line_size), "r" (line_size*3) |
0 | 50 :"memory"); |
51 pix += line_size*4; p += line_size*4; | |
52 } | |
53 while(dh--) { | |
54 __asm __volatile( | |
55 "movq %1, %%mm0\n\t" | |
56 "movq 1%1, %%mm1\n\t" | |
57 PAVGB" %%mm1, %%mm0\n\t" | |
58 "movq %%mm0, %0\n\t" | |
59 :"=m"(*p) | |
60 :"m"(*pix) | |
61 :"memory"); | |
62 pix += line_size; p += line_size; | |
63 } | |
64 } | |
65 | |
66 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
67 { | |
68 int dh, hh; | |
69 UINT8 *p; | |
70 const UINT8 *pix; | |
71 p = block; | |
72 pix = pixels; | |
73 | |
74 hh=h>>1; | |
75 dh=h&1; | |
76 while(hh--) { | |
77 __asm __volatile( | |
78 "movq %2, %%mm0\n\t" | |
79 "movq %3, %%mm1\n\t" | |
80 "movq %4, %%mm2\n\t" | |
81 PAVGB" %%mm1, %%mm0\n\t" | |
82 PAVGB" %%mm2, %%mm1\n\t" | |
83 "movq %%mm0, %0\n\t" | |
84 "movq %%mm1, %1\n\t" | |
85 :"=m"(*p), "=m"(*(p+line_size)) | |
86 :"m"(*pix), "m"(*(pix+line_size)), | |
87 "m"(*(pix+line_size*2)) | |
88 :"memory"); | |
89 pix += line_size*2; | |
90 p += line_size*2; | |
91 } | |
92 if(dh) { | |
93 __asm __volatile( | |
94 "movq %1, %%mm0\n\t" | |
95 "movq %2, %%mm1\n\t" | |
96 PAVGB" %%mm1, %%mm0\n\t" | |
97 "movq %%mm0, %0\n\t" | |
98 :"=m"(*p) | |
99 :"m"(*pix), | |
100 "m"(*(pix+line_size)) | |
101 :"memory"); | |
102 } | |
103 } | |
104 | |
105 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
106 { | |
107 int dh, hh; | |
108 UINT8 *p; | |
109 const UINT8 *pix; | |
110 p = block; | |
111 pix = pixels; | |
112 hh=h>>2; | |
113 dh=h&3; | |
114 while(hh--) { | |
115 __asm __volatile( | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
116 "movq (%0), %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
117 "movq (%1), %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
118 "movq (%0, %2), %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
119 "movq (%1, %2), %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
120 "movq (%0, %2, 2), %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
121 "movq (%1, %2, 2), %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
122 "movq (%0, %3), %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
123 "movq (%1, %3), %%mm7\n\t" |
0 | 124 PAVGB" %%mm1, %%mm0\n\t" |
125 PAVGB" %%mm3, %%mm2\n\t" | |
126 PAVGB" %%mm5, %%mm4\n\t" | |
127 PAVGB" %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
128 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
129 "movq %%mm2, (%0, %2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
130 "movq %%mm4, (%0, %2, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
131 "movq %%mm6, (%0, %3)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
132 ::"r"(p), "r"(pix), "r" (line_size), "r" (line_size*3) |
0 | 133 :"memory"); |
134 pix += line_size*4; p += line_size*4; | |
135 } | |
136 while(dh--) { | |
137 __asm __volatile( | |
138 "movq %0, %%mm0\n\t" | |
139 "movq %1, %%mm1\n\t" | |
140 PAVGB" %%mm1, %%mm0\n\t" | |
141 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
142 :"+m"(*p) |
0 | 143 :"m"(*pix) |
144 :"memory"); | |
145 pix += line_size; p += line_size; | |
146 } | |
147 } | |
148 | |
149 static void DEF(avg_pixels_x2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
150 { | |
151 int dh, hh; | |
152 UINT8 *p; | |
153 const UINT8 *pix; | |
154 p = block; | |
155 pix = pixels; | |
156 hh=h>>1; | |
157 dh=h&1; | |
158 while(hh--) { | |
159 __asm __volatile( | |
160 "movq %2, %%mm2\n\t" | |
161 "movq 1%2, %%mm3\n\t" | |
162 "movq %3, %%mm4\n\t" | |
163 "movq 1%3, %%mm5\n\t" | |
164 "movq %0, %%mm0\n\t" | |
165 "movq %1, %%mm1\n\t" | |
166 PAVGB" %%mm3, %%mm2\n\t" | |
167 PAVGB" %%mm2, %%mm0\n\t" | |
168 PAVGB" %%mm5, %%mm4\n\t" | |
169 PAVGB" %%mm4, %%mm1\n\t" | |
170 "movq %%mm0, %0\n\t" | |
171 "movq %%mm1, %1\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
172 :"+m"(*p), "+m"(*(p+line_size)) |
0 | 173 :"m"(*pix), "m"(*(pix+line_size)) |
174 :"memory"); | |
175 pix += line_size*2; | |
176 p += line_size*2; | |
177 } | |
178 if(dh) { | |
179 __asm __volatile( | |
180 "movq %1, %%mm1\n\t" | |
181 "movq 1%1, %%mm2\n\t" | |
182 "movq %0, %%mm0\n\t" | |
183 PAVGB" %%mm2, %%mm1\n\t" | |
184 PAVGB" %%mm1, %%mm0\n\t" | |
185 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
186 :"+m"(*p) |
0 | 187 :"m"(*pix) |
188 :"memory"); | |
189 } | |
190 } | |
191 | |
192 static void DEF(avg_pixels_y2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
193 { | |
194 int dh, hh; | |
195 UINT8 *p; | |
196 const UINT8 *pix; | |
197 p = block; | |
198 pix = pixels; | |
199 hh=h>>1; | |
200 dh=h&1; | |
201 while(hh--) { | |
202 __asm __volatile( | |
203 "movq %2, %%mm2\n\t" | |
204 "movq %3, %%mm3\n\t" | |
205 "movq %3, %%mm4\n\t" | |
206 "movq %4, %%mm5\n\t" | |
207 "movq %0, %%mm0\n\t" | |
208 "movq %1, %%mm1\n\t" | |
209 PAVGB" %%mm3, %%mm2\n\t" | |
210 PAVGB" %%mm2, %%mm0\n\t" | |
211 PAVGB" %%mm5, %%mm4\n\t" | |
212 PAVGB" %%mm4, %%mm1\n\t" | |
213 "movq %%mm0, %0\n\t" | |
214 "movq %%mm1, %1\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
215 :"+m"(*p), "+m"(*(p+line_size)) |
0 | 216 :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)) |
217 :"memory"); | |
218 pix += line_size*2; | |
219 p += line_size*2; | |
220 } | |
221 if(dh) { | |
222 __asm __volatile( | |
223 "movq %1, %%mm1\n\t" | |
224 "movq %2, %%mm2\n\t" | |
225 "movq %0, %%mm0\n\t" | |
226 PAVGB" %%mm2, %%mm1\n\t" | |
227 PAVGB" %%mm1, %%mm0\n\t" | |
228 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
229 :"+m"(*p) |
0 | 230 :"m"(*pix), "m"(*(pix+line_size)) |
231 :"memory"); | |
232 } | |
233 } | |
234 | |
235 static void DEF(avg_pixels_xy2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
236 { | |
237 UINT8 *p; | |
238 const UINT8 *pix; | |
239 p = block; | |
240 pix = pixels; | |
241 __asm __volatile( | |
242 "pxor %%mm7, %%mm7\n\t" | |
243 "movq %0, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
244 ::"m"(mm_wtwo)); |
0 | 245 do { |
246 __asm __volatile( | |
247 "movq %1, %%mm0\n\t" | |
248 "movq %2, %%mm1\n\t" | |
249 "movq 1%1, %%mm4\n\t" | |
250 "movq 1%2, %%mm5\n\t" | |
251 "movq %%mm0, %%mm2\n\t" | |
252 "movq %%mm1, %%mm3\n\t" | |
253 "punpcklbw %%mm7, %%mm0\n\t" | |
254 "punpcklbw %%mm7, %%mm1\n\t" | |
255 "punpckhbw %%mm7, %%mm2\n\t" | |
256 "punpckhbw %%mm7, %%mm3\n\t" | |
257 "paddusw %%mm1, %%mm0\n\t" | |
258 "paddusw %%mm3, %%mm2\n\t" | |
259 "movq %%mm4, %%mm1\n\t" | |
260 "movq %%mm5, %%mm3\n\t" | |
261 "punpcklbw %%mm7, %%mm4\n\t" | |
262 "punpcklbw %%mm7, %%mm5\n\t" | |
263 "punpckhbw %%mm7, %%mm1\n\t" | |
264 "punpckhbw %%mm7, %%mm3\n\t" | |
265 "paddusw %%mm5, %%mm4\n\t" | |
266 "paddusw %%mm3, %%mm1\n\t" | |
267 "paddusw %%mm6, %%mm4\n\t" | |
268 "paddusw %%mm6, %%mm1\n\t" | |
269 "paddusw %%mm4, %%mm0\n\t" | |
270 "paddusw %%mm1, %%mm2\n\t" | |
271 "psrlw $2, %%mm0\n\t" | |
272 "psrlw $2, %%mm2\n\t" | |
273 "packuswb %%mm2, %%mm0\n\t" | |
274 PAVGB" %0, %%mm0\n\t" | |
275 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
276 :"+m"(*p) |
0 | 277 :"m"(*pix), |
278 "m"(*(pix+line_size)) | |
279 :"memory"); | |
280 pix += line_size; | |
281 p += line_size ; | |
282 } while(--h); | |
283 } | |
284 | |
285 static void DEF(sub_pixels_x2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
286 { | |
287 DCTELEM *p; | |
288 const UINT8 *pix; | |
289 p = block; | |
290 pix = pixels; | |
291 __asm __volatile( | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
292 "pxor %%mm7, %%mm7":); |
0 | 293 do { |
294 __asm __volatile( | |
295 "movq 1%1, %%mm2\n\t" | |
296 "movq %0, %%mm0\n\t" | |
297 PAVGB" %1, %%mm2\n\t" | |
298 "movq 8%0, %%mm1\n\t" | |
299 "movq %%mm2, %%mm3\n\t" | |
300 "punpcklbw %%mm7, %%mm2\n\t" | |
301 "punpckhbw %%mm7, %%mm3\n\t" | |
302 "psubsw %%mm2, %%mm0\n\t" | |
303 "psubsw %%mm3, %%mm1\n\t" | |
304 "movq %%mm0, %0\n\t" | |
305 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
306 :"+m"(*p) |
0 | 307 :"m"(*pix) |
308 :"memory"); | |
309 pix += line_size; | |
310 p += 8; | |
311 } while (--h); | |
312 } | |
313 | |
314 static void DEF(sub_pixels_y2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h) | |
315 { | |
316 DCTELEM *p; | |
317 const UINT8 *pix; | |
318 p = block; | |
319 pix = pixels; | |
320 __asm __volatile( | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
321 "pxor %%mm7, %%mm7":); |
0 | 322 do { |
323 __asm __volatile( | |
324 "movq %2, %%mm2\n\t" | |
325 "movq %0, %%mm0\n\t" | |
326 PAVGB" %1, %%mm2\n\t" | |
327 "movq 8%0, %%mm1\n\t" | |
328 "movq %%mm2, %%mm3\n\t" | |
329 "punpcklbw %%mm7, %%mm2\n\t" | |
330 "punpckhbw %%mm7, %%mm3\n\t" | |
331 "psubsw %%mm2, %%mm0\n\t" | |
332 "psubsw %%mm3, %%mm1\n\t" | |
333 "movq %%mm0, %0\n\t" | |
334 "movq %%mm1, 8%0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
8
diff
changeset
|
335 :"+m"(*p) |
0 | 336 :"m"(*pix), "m"(*(pix+line_size)) |
337 :"memory"); | |
338 pix += line_size; | |
339 p += 8; | |
340 } while (--h); | |
341 } | |
342 |