Mercurial > libavcodec.hg
comparison i386/dsputil_mmx_avg.h @ 2207:22b768f1261a libavcodec
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and committed a long time ago but apparently never used, qpel motion compensation is 5% faster
now
author | michael |
---|---|
date | Mon, 06 Sep 2004 03:17:31 +0000 |
parents | f65d87bfdd5a |
children | c4a476971abc |
comparison
equal
deleted
inserted
replaced
2206:713ad427a3c7 | 2207:22b768f1261a |
---|---|
51 :"+g"(h), "+S"(pixels), "+D"(block) | 51 :"+g"(h), "+S"(pixels), "+D"(block) |
52 :"r" (line_size) | 52 :"r" (line_size) |
53 :"%eax", "memory"); | 53 :"%eax", "memory"); |
54 } | 54 } |
55 | 55 |
56 static __attribute__((unused)) void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | 56 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
57 { | 57 { |
58 __asm __volatile( | 58 __asm __volatile( |
59 "testl $1, %0 \n\t" | |
60 " jz 1f \n\t" | |
61 "movq (%1), %%mm0 \n\t" | |
62 "movq (%2), %%mm1 \n\t" | |
63 "addl %4, %1 \n\t" | |
64 "addl $8, %2 \n\t" | |
65 PAVGB" %%mm1, %%mm0 \n\t" | |
66 "movq %%mm0, (%3) \n\t" | |
67 "addl %5, %3 \n\t" | |
68 "decl %0 \n\t" | |
59 "1: \n\t" | 69 "1: \n\t" |
60 "movq (%1), %%mm0 \n\t" | 70 "movq (%1), %%mm0 \n\t" |
61 "addl %4, %1 \n\t" | 71 "addl %4, %1 \n\t" |
62 "movq (%1), %%mm1 \n\t" | 72 "movq (%1), %%mm1 \n\t" |
63 "addl %4, %1 \n\t" | 73 "addl %4, %1 \n\t" |
78 "movq %%mm1, (%3) \n\t" | 88 "movq %%mm1, (%3) \n\t" |
79 "addl %5, %3 \n\t" | 89 "addl %5, %3 \n\t" |
80 "addl $32, %2 \n\t" | 90 "addl $32, %2 \n\t" |
81 "subl $4, %0 \n\t" | 91 "subl $4, %0 \n\t" |
82 "jnz 1b \n\t" | 92 "jnz 1b \n\t" |
83 :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | 93 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used |
94 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
95 #else | |
96 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
97 #endif | |
98 :"S"(src1Stride), "D"(dstStride) | |
99 :"memory"); | |
100 //the following should be used, though better not with gcc ... | |
101 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
84 :"r"(src1Stride), "r"(dstStride) | 102 :"r"(src1Stride), "r"(dstStride) |
85 :"memory"); | 103 :"memory");*/ |
104 } | |
105 | |
106 static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
107 { | |
108 __asm __volatile( | |
109 "pcmpeqb %%mm6, %%mm6 \n\t" | |
110 "testl $1, %0 \n\t" | |
111 " jz 1f \n\t" | |
112 "movq (%1), %%mm0 \n\t" | |
113 "movq (%2), %%mm1 \n\t" | |
114 "addl %4, %1 \n\t" | |
115 "addl $8, %2 \n\t" | |
116 "pxor %%mm6, %%mm0 \n\t" | |
117 "pxor %%mm6, %%mm1 \n\t" | |
118 PAVGB" %%mm1, %%mm0 \n\t" | |
119 "pxor %%mm6, %%mm0 \n\t" | |
120 "movq %%mm0, (%3) \n\t" | |
121 "addl %5, %3 \n\t" | |
122 "decl %0 \n\t" | |
123 "1: \n\t" | |
124 "movq (%1), %%mm0 \n\t" | |
125 "addl %4, %1 \n\t" | |
126 "movq (%1), %%mm1 \n\t" | |
127 "addl %4, %1 \n\t" | |
128 "movq (%2), %%mm2 \n\t" | |
129 "movq 8(%2), %%mm3 \n\t" | |
130 "pxor %%mm6, %%mm0 \n\t" | |
131 "pxor %%mm6, %%mm1 \n\t" | |
132 "pxor %%mm6, %%mm2 \n\t" | |
133 "pxor %%mm6, %%mm3 \n\t" | |
134 PAVGB" %%mm2, %%mm0 \n\t" | |
135 PAVGB" %%mm3, %%mm1 \n\t" | |
136 "pxor %%mm6, %%mm0 \n\t" | |
137 "pxor %%mm6, %%mm1 \n\t" | |
138 "movq %%mm0, (%3) \n\t" | |
139 "addl %5, %3 \n\t" | |
140 "movq %%mm1, (%3) \n\t" | |
141 "addl %5, %3 \n\t" | |
142 "movq (%1), %%mm0 \n\t" | |
143 "addl %4, %1 \n\t" | |
144 "movq (%1), %%mm1 \n\t" | |
145 "addl %4, %1 \n\t" | |
146 "movq 16(%2), %%mm2 \n\t" | |
147 "movq 24(%2), %%mm3 \n\t" | |
148 "pxor %%mm6, %%mm0 \n\t" | |
149 "pxor %%mm6, %%mm1 \n\t" | |
150 "pxor %%mm6, %%mm2 \n\t" | |
151 "pxor %%mm6, %%mm3 \n\t" | |
152 PAVGB" %%mm2, %%mm0 \n\t" | |
153 PAVGB" %%mm3, %%mm1 \n\t" | |
154 "pxor %%mm6, %%mm0 \n\t" | |
155 "pxor %%mm6, %%mm1 \n\t" | |
156 "movq %%mm0, (%3) \n\t" | |
157 "addl %5, %3 \n\t" | |
158 "movq %%mm1, (%3) \n\t" | |
159 "addl %5, %3 \n\t" | |
160 "addl $32, %2 \n\t" | |
161 "subl $4, %0 \n\t" | |
162 "jnz 1b \n\t" | |
163 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used | |
164 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
165 #else | |
166 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
167 #endif | |
168 :"S"(src1Stride), "D"(dstStride) | |
169 :"memory"); | |
170 //the following should be used, though better not with gcc ... | |
171 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
172 :"r"(src1Stride), "r"(dstStride) | |
173 :"memory");*/ | |
174 } | |
175 | |
176 static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
177 { | |
178 __asm __volatile( | |
179 "testl $1, %0 \n\t" | |
180 " jz 1f \n\t" | |
181 "movq (%1), %%mm0 \n\t" | |
182 "movq (%2), %%mm1 \n\t" | |
183 "addl %4, %1 \n\t" | |
184 "addl $8, %2 \n\t" | |
185 PAVGB" %%mm1, %%mm0 \n\t" | |
186 PAVGB" (%3), %%mm0 \n\t" | |
187 "movq %%mm0, (%3) \n\t" | |
188 "addl %5, %3 \n\t" | |
189 "decl %0 \n\t" | |
190 "1: \n\t" | |
191 "movq (%1), %%mm0 \n\t" | |
192 "addl %4, %1 \n\t" | |
193 "movq (%1), %%mm1 \n\t" | |
194 "addl %4, %1 \n\t" | |
195 PAVGB" (%2), %%mm0 \n\t" | |
196 PAVGB" 8(%2), %%mm1 \n\t" | |
197 PAVGB" (%3), %%mm0 \n\t" | |
198 "movq %%mm0, (%3) \n\t" | |
199 "addl %5, %3 \n\t" | |
200 PAVGB" (%3), %%mm1 \n\t" | |
201 "movq %%mm1, (%3) \n\t" | |
202 "addl %5, %3 \n\t" | |
203 "movq (%1), %%mm0 \n\t" | |
204 "addl %4, %1 \n\t" | |
205 "movq (%1), %%mm1 \n\t" | |
206 "addl %4, %1 \n\t" | |
207 PAVGB" 16(%2), %%mm0 \n\t" | |
208 PAVGB" 24(%2), %%mm1 \n\t" | |
209 PAVGB" (%3), %%mm0 \n\t" | |
210 "movq %%mm0, (%3) \n\t" | |
211 "addl %5, %3 \n\t" | |
212 PAVGB" (%3), %%mm1 \n\t" | |
213 "movq %%mm1, (%3) \n\t" | |
214 "addl %5, %3 \n\t" | |
215 "addl $32, %2 \n\t" | |
216 "subl $4, %0 \n\t" | |
217 "jnz 1b \n\t" | |
218 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used | |
219 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
220 #else | |
221 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
222 #endif | |
223 :"S"(src1Stride), "D"(dstStride) | |
224 :"memory"); | |
225 //the following should be used, though better not with gcc ... | |
226 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
227 :"r"(src1Stride), "r"(dstStride) | |
228 :"memory");*/ | |
86 } | 229 } |
87 | 230 |
88 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 231 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
89 { | 232 { |
90 __asm __volatile( | 233 __asm __volatile( |
123 :"+g"(h), "+S"(pixels), "+D"(block) | 266 :"+g"(h), "+S"(pixels), "+D"(block) |
124 :"r" (line_size) | 267 :"r" (line_size) |
125 :"%eax", "memory"); | 268 :"%eax", "memory"); |
126 } | 269 } |
127 | 270 |
128 static __attribute__((unused)) void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | 271 static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
129 { | 272 { |
130 __asm __volatile( | 273 __asm __volatile( |
274 "testl $1, %0 \n\t" | |
275 " jz 1f \n\t" | |
276 "movq (%1), %%mm0 \n\t" | |
277 "movq 8(%1), %%mm1 \n\t" | |
278 PAVGB" (%2), %%mm0 \n\t" | |
279 PAVGB" 8(%2), %%mm1 \n\t" | |
280 "addl %4, %1 \n\t" | |
281 "addl $16, %2 \n\t" | |
282 "movq %%mm0, (%3) \n\t" | |
283 "movq %%mm1, 8(%3) \n\t" | |
284 "addl %5, %3 \n\t" | |
285 "decl %0 \n\t" | |
131 "1: \n\t" | 286 "1: \n\t" |
132 "movq (%1), %%mm0 \n\t" | 287 "movq (%1), %%mm0 \n\t" |
133 "movq 8(%1), %%mm1 \n\t" | 288 "movq 8(%1), %%mm1 \n\t" |
134 "addl %4, %1 \n\t" | 289 "addl %4, %1 \n\t" |
135 PAVGB" (%2), %%mm0 \n\t" | 290 PAVGB" (%2), %%mm0 \n\t" |
146 "movq %%mm1, 8(%3) \n\t" | 301 "movq %%mm1, 8(%3) \n\t" |
147 "addl %5, %3 \n\t" | 302 "addl %5, %3 \n\t" |
148 "addl $32, %2 \n\t" | 303 "addl $32, %2 \n\t" |
149 "subl $2, %0 \n\t" | 304 "subl $2, %0 \n\t" |
150 "jnz 1b \n\t" | 305 "jnz 1b \n\t" |
151 :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | 306 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used |
307 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
308 #else | |
309 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
310 #endif | |
311 :"S"(src1Stride), "D"(dstStride) | |
312 :"memory"); | |
313 //the following should be used, though better not with gcc ... | |
314 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
152 :"r"(src1Stride), "r"(dstStride) | 315 :"r"(src1Stride), "r"(dstStride) |
153 :"memory"); | 316 :"memory");*/ |
317 } | |
318 | |
319 static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
320 { | |
321 __asm __volatile( | |
322 "testl $1, %0 \n\t" | |
323 " jz 1f \n\t" | |
324 "movq (%1), %%mm0 \n\t" | |
325 "movq 8(%1), %%mm1 \n\t" | |
326 PAVGB" (%2), %%mm0 \n\t" | |
327 PAVGB" 8(%2), %%mm1 \n\t" | |
328 "addl %4, %1 \n\t" | |
329 "addl $16, %2 \n\t" | |
330 PAVGB" (%3), %%mm0 \n\t" | |
331 PAVGB" 8(%3), %%mm1 \n\t" | |
332 "movq %%mm0, (%3) \n\t" | |
333 "movq %%mm1, 8(%3) \n\t" | |
334 "addl %5, %3 \n\t" | |
335 "decl %0 \n\t" | |
336 "1: \n\t" | |
337 "movq (%1), %%mm0 \n\t" | |
338 "movq 8(%1), %%mm1 \n\t" | |
339 "addl %4, %1 \n\t" | |
340 PAVGB" (%2), %%mm0 \n\t" | |
341 PAVGB" 8(%2), %%mm1 \n\t" | |
342 PAVGB" (%3), %%mm0 \n\t" | |
343 PAVGB" 8(%3), %%mm1 \n\t" | |
344 "movq %%mm0, (%3) \n\t" | |
345 "movq %%mm1, 8(%3) \n\t" | |
346 "addl %5, %3 \n\t" | |
347 "movq (%1), %%mm0 \n\t" | |
348 "movq 8(%1), %%mm1 \n\t" | |
349 "addl %4, %1 \n\t" | |
350 PAVGB" 16(%2), %%mm0 \n\t" | |
351 PAVGB" 24(%2), %%mm1 \n\t" | |
352 PAVGB" (%3), %%mm0 \n\t" | |
353 PAVGB" 8(%3), %%mm1 \n\t" | |
354 "movq %%mm0, (%3) \n\t" | |
355 "movq %%mm1, 8(%3) \n\t" | |
356 "addl %5, %3 \n\t" | |
357 "addl $32, %2 \n\t" | |
358 "subl $2, %0 \n\t" | |
359 "jnz 1b \n\t" | |
360 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used | |
361 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
362 #else | |
363 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
364 #endif | |
365 :"S"(src1Stride), "D"(dstStride) | |
366 :"memory"); | |
367 //the following should be used, though better not with gcc ... | |
368 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
369 :"r"(src1Stride), "r"(dstStride) | |
370 :"memory");*/ | |
371 } | |
372 | |
373 static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
374 { | |
375 __asm __volatile( | |
376 "pcmpeqb %%mm6, %%mm6\n\t" | |
377 "testl $1, %0 \n\t" | |
378 " jz 1f \n\t" | |
379 "movq (%1), %%mm0 \n\t" | |
380 "movq 8(%1), %%mm1 \n\t" | |
381 "movq (%2), %%mm2 \n\t" | |
382 "movq 8(%2), %%mm3 \n\t" | |
383 "pxor %%mm6, %%mm0 \n\t" | |
384 "pxor %%mm6, %%mm1 \n\t" | |
385 "pxor %%mm6, %%mm2 \n\t" | |
386 "pxor %%mm6, %%mm3 \n\t" | |
387 PAVGB" %%mm2, %%mm0 \n\t" | |
388 PAVGB" %%mm3, %%mm1 \n\t" | |
389 "pxor %%mm6, %%mm0 \n\t" | |
390 "pxor %%mm6, %%mm1 \n\t" | |
391 "addl %4, %1 \n\t" | |
392 "addl $16, %2 \n\t" | |
393 "movq %%mm0, (%3) \n\t" | |
394 "movq %%mm1, 8(%3) \n\t" | |
395 "addl %5, %3 \n\t" | |
396 "decl %0 \n\t" | |
397 "1: \n\t" | |
398 "movq (%1), %%mm0 \n\t" | |
399 "movq 8(%1), %%mm1 \n\t" | |
400 "addl %4, %1 \n\t" | |
401 "movq (%2), %%mm2 \n\t" | |
402 "movq 8(%2), %%mm3 \n\t" | |
403 "pxor %%mm6, %%mm0 \n\t" | |
404 "pxor %%mm6, %%mm1 \n\t" | |
405 "pxor %%mm6, %%mm2 \n\t" | |
406 "pxor %%mm6, %%mm3 \n\t" | |
407 PAVGB" %%mm2, %%mm0 \n\t" | |
408 PAVGB" %%mm3, %%mm1 \n\t" | |
409 "pxor %%mm6, %%mm0 \n\t" | |
410 "pxor %%mm6, %%mm1 \n\t" | |
411 "movq %%mm0, (%3) \n\t" | |
412 "movq %%mm1, 8(%3) \n\t" | |
413 "addl %5, %3 \n\t" | |
414 "movq (%1), %%mm0 \n\t" | |
415 "movq 8(%1), %%mm1 \n\t" | |
416 "addl %4, %1 \n\t" | |
417 "movq 16(%2), %%mm2 \n\t" | |
418 "movq 24(%2), %%mm3 \n\t" | |
419 "pxor %%mm6, %%mm0 \n\t" | |
420 "pxor %%mm6, %%mm1 \n\t" | |
421 "pxor %%mm6, %%mm2 \n\t" | |
422 "pxor %%mm6, %%mm3 \n\t" | |
423 PAVGB" %%mm2, %%mm0 \n\t" | |
424 PAVGB" %%mm3, %%mm1 \n\t" | |
425 "pxor %%mm6, %%mm0 \n\t" | |
426 "pxor %%mm6, %%mm1 \n\t" | |
427 "movq %%mm0, (%3) \n\t" | |
428 "movq %%mm1, 8(%3) \n\t" | |
429 "addl %5, %3 \n\t" | |
430 "addl $32, %2 \n\t" | |
431 "subl $2, %0 \n\t" | |
432 "jnz 1b \n\t" | |
433 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used | |
434 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
435 #else | |
436 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
437 #endif | |
438 :"S"(src1Stride), "D"(dstStride) | |
439 :"memory"); | |
440 //the following should be used, though better not with gcc ... | |
441 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
442 :"r"(src1Stride), "r"(dstStride) | |
443 :"memory");*/ | |
154 } | 444 } |
155 | 445 |
156 /* GL: this function does incorrect rounding if overflow */ | 446 /* GL: this function does incorrect rounding if overflow */ |
157 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 447 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
158 { | 448 { |