comparison i386/dsputil_mmx_avg.h @ 2207:22b768f1261a libavcodec

10000l fix and use more mmx2/3dnow code for mpeg4 qpel which was written and committed a long time ago but apparently never used; qpel motion compensation is 5% faster now
author michael
date Mon, 06 Sep 2004 03:17:31 +0000
parents f65d87bfdd5a
children c4a476971abc
comparison
equal deleted inserted replaced
2206:713ad427a3c7 2207:22b768f1261a
51 :"+g"(h), "+S"(pixels), "+D"(block) 51 :"+g"(h), "+S"(pixels), "+D"(block)
52 :"r" (line_size) 52 :"r" (line_size)
53 :"%eax", "memory"); 53 :"%eax", "memory");
54 } 54 }
55 55
56 static __attribute__((unused)) void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 56 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
57 { 57 {
58 __asm __volatile( 58 __asm __volatile(
59 "testl $1, %0 \n\t"
60 " jz 1f \n\t"
61 "movq (%1), %%mm0 \n\t"
62 "movq (%2), %%mm1 \n\t"
63 "addl %4, %1 \n\t"
64 "addl $8, %2 \n\t"
65 PAVGB" %%mm1, %%mm0 \n\t"
66 "movq %%mm0, (%3) \n\t"
67 "addl %5, %3 \n\t"
68 "decl %0 \n\t"
59 "1: \n\t" 69 "1: \n\t"
60 "movq (%1), %%mm0 \n\t" 70 "movq (%1), %%mm0 \n\t"
61 "addl %4, %1 \n\t" 71 "addl %4, %1 \n\t"
62 "movq (%1), %%mm1 \n\t" 72 "movq (%1), %%mm1 \n\t"
63 "addl %4, %1 \n\t" 73 "addl %4, %1 \n\t"
78 "movq %%mm1, (%3) \n\t" 88 "movq %%mm1, (%3) \n\t"
79 "addl %5, %3 \n\t" 89 "addl %5, %3 \n\t"
80 "addl $32, %2 \n\t" 90 "addl $32, %2 \n\t"
81 "subl $4, %0 \n\t" 91 "subl $4, %0 \n\t"
82 "jnz 1b \n\t" 92 "jnz 1b \n\t"
83 :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) 93 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
94 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
95 #else
96 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
97 #endif
98 :"S"(src1Stride), "D"(dstStride)
99 :"memory");
100 //the following should be used, though better not with gcc ...
101 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
84 :"r"(src1Stride), "r"(dstStride) 102 :"r"(src1Stride), "r"(dstStride)
85 :"memory"); 103 :"memory");*/
104 }
105
106 static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
107 {
108 __asm __volatile(
109 "pcmpeqb %%mm6, %%mm6 \n\t"
110 "testl $1, %0 \n\t"
111 " jz 1f \n\t"
112 "movq (%1), %%mm0 \n\t"
113 "movq (%2), %%mm1 \n\t"
114 "addl %4, %1 \n\t"
115 "addl $8, %2 \n\t"
116 "pxor %%mm6, %%mm0 \n\t"
117 "pxor %%mm6, %%mm1 \n\t"
118 PAVGB" %%mm1, %%mm0 \n\t"
119 "pxor %%mm6, %%mm0 \n\t"
120 "movq %%mm0, (%3) \n\t"
121 "addl %5, %3 \n\t"
122 "decl %0 \n\t"
123 "1: \n\t"
124 "movq (%1), %%mm0 \n\t"
125 "addl %4, %1 \n\t"
126 "movq (%1), %%mm1 \n\t"
127 "addl %4, %1 \n\t"
128 "movq (%2), %%mm2 \n\t"
129 "movq 8(%2), %%mm3 \n\t"
130 "pxor %%mm6, %%mm0 \n\t"
131 "pxor %%mm6, %%mm1 \n\t"
132 "pxor %%mm6, %%mm2 \n\t"
133 "pxor %%mm6, %%mm3 \n\t"
134 PAVGB" %%mm2, %%mm0 \n\t"
135 PAVGB" %%mm3, %%mm1 \n\t"
136 "pxor %%mm6, %%mm0 \n\t"
137 "pxor %%mm6, %%mm1 \n\t"
138 "movq %%mm0, (%3) \n\t"
139 "addl %5, %3 \n\t"
140 "movq %%mm1, (%3) \n\t"
141 "addl %5, %3 \n\t"
142 "movq (%1), %%mm0 \n\t"
143 "addl %4, %1 \n\t"
144 "movq (%1), %%mm1 \n\t"
145 "addl %4, %1 \n\t"
146 "movq 16(%2), %%mm2 \n\t"
147 "movq 24(%2), %%mm3 \n\t"
148 "pxor %%mm6, %%mm0 \n\t"
149 "pxor %%mm6, %%mm1 \n\t"
150 "pxor %%mm6, %%mm2 \n\t"
151 "pxor %%mm6, %%mm3 \n\t"
152 PAVGB" %%mm2, %%mm0 \n\t"
153 PAVGB" %%mm3, %%mm1 \n\t"
154 "pxor %%mm6, %%mm0 \n\t"
155 "pxor %%mm6, %%mm1 \n\t"
156 "movq %%mm0, (%3) \n\t"
157 "addl %5, %3 \n\t"
158 "movq %%mm1, (%3) \n\t"
159 "addl %5, %3 \n\t"
160 "addl $32, %2 \n\t"
161 "subl $4, %0 \n\t"
162 "jnz 1b \n\t"
163 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
164 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
165 #else
166 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
167 #endif
168 :"S"(src1Stride), "D"(dstStride)
169 :"memory");
170 //the following should be used, though better not with gcc ...
171 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
172 :"r"(src1Stride), "r"(dstStride)
173 :"memory");*/
174 }
175
176 static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
177 {
178 __asm __volatile(
179 "testl $1, %0 \n\t"
180 " jz 1f \n\t"
181 "movq (%1), %%mm0 \n\t"
182 "movq (%2), %%mm1 \n\t"
183 "addl %4, %1 \n\t"
184 "addl $8, %2 \n\t"
185 PAVGB" %%mm1, %%mm0 \n\t"
186 PAVGB" (%3), %%mm0 \n\t"
187 "movq %%mm0, (%3) \n\t"
188 "addl %5, %3 \n\t"
189 "decl %0 \n\t"
190 "1: \n\t"
191 "movq (%1), %%mm0 \n\t"
192 "addl %4, %1 \n\t"
193 "movq (%1), %%mm1 \n\t"
194 "addl %4, %1 \n\t"
195 PAVGB" (%2), %%mm0 \n\t"
196 PAVGB" 8(%2), %%mm1 \n\t"
197 PAVGB" (%3), %%mm0 \n\t"
198 "movq %%mm0, (%3) \n\t"
199 "addl %5, %3 \n\t"
200 PAVGB" (%3), %%mm1 \n\t"
201 "movq %%mm1, (%3) \n\t"
202 "addl %5, %3 \n\t"
203 "movq (%1), %%mm0 \n\t"
204 "addl %4, %1 \n\t"
205 "movq (%1), %%mm1 \n\t"
206 "addl %4, %1 \n\t"
207 PAVGB" 16(%2), %%mm0 \n\t"
208 PAVGB" 24(%2), %%mm1 \n\t"
209 PAVGB" (%3), %%mm0 \n\t"
210 "movq %%mm0, (%3) \n\t"
211 "addl %5, %3 \n\t"
212 PAVGB" (%3), %%mm1 \n\t"
213 "movq %%mm1, (%3) \n\t"
214 "addl %5, %3 \n\t"
215 "addl $32, %2 \n\t"
216 "subl $4, %0 \n\t"
217 "jnz 1b \n\t"
218 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
219 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
220 #else
221 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
222 #endif
223 :"S"(src1Stride), "D"(dstStride)
224 :"memory");
225 //the following should be used, though better not with gcc ...
226 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
227 :"r"(src1Stride), "r"(dstStride)
228 :"memory");*/
86 } 229 }
87 230
88 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 231 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
89 { 232 {
90 __asm __volatile( 233 __asm __volatile(
123 :"+g"(h), "+S"(pixels), "+D"(block) 266 :"+g"(h), "+S"(pixels), "+D"(block)
124 :"r" (line_size) 267 :"r" (line_size)
125 :"%eax", "memory"); 268 :"%eax", "memory");
126 } 269 }
127 270
128 static __attribute__((unused)) void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 271 static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
129 { 272 {
130 __asm __volatile( 273 __asm __volatile(
274 "testl $1, %0 \n\t"
275 " jz 1f \n\t"
276 "movq (%1), %%mm0 \n\t"
277 "movq 8(%1), %%mm1 \n\t"
278 PAVGB" (%2), %%mm0 \n\t"
279 PAVGB" 8(%2), %%mm1 \n\t"
280 "addl %4, %1 \n\t"
281 "addl $16, %2 \n\t"
282 "movq %%mm0, (%3) \n\t"
283 "movq %%mm1, 8(%3) \n\t"
284 "addl %5, %3 \n\t"
285 "decl %0 \n\t"
131 "1: \n\t" 286 "1: \n\t"
132 "movq (%1), %%mm0 \n\t" 287 "movq (%1), %%mm0 \n\t"
133 "movq 8(%1), %%mm1 \n\t" 288 "movq 8(%1), %%mm1 \n\t"
134 "addl %4, %1 \n\t" 289 "addl %4, %1 \n\t"
135 PAVGB" (%2), %%mm0 \n\t" 290 PAVGB" (%2), %%mm0 \n\t"
146 "movq %%mm1, 8(%3) \n\t" 301 "movq %%mm1, 8(%3) \n\t"
147 "addl %5, %3 \n\t" 302 "addl %5, %3 \n\t"
148 "addl $32, %2 \n\t" 303 "addl $32, %2 \n\t"
149 "subl $2, %0 \n\t" 304 "subl $2, %0 \n\t"
150 "jnz 1b \n\t" 305 "jnz 1b \n\t"
151 :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) 306 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
307 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
308 #else
309 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
310 #endif
311 :"S"(src1Stride), "D"(dstStride)
312 :"memory");
313 //the following should be used, though better not with gcc ...
314 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
152 :"r"(src1Stride), "r"(dstStride) 315 :"r"(src1Stride), "r"(dstStride)
153 :"memory"); 316 :"memory");*/
317 }
318
319 static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
320 {
321 __asm __volatile(
322 "testl $1, %0 \n\t"
323 " jz 1f \n\t"
324 "movq (%1), %%mm0 \n\t"
325 "movq 8(%1), %%mm1 \n\t"
326 PAVGB" (%2), %%mm0 \n\t"
327 PAVGB" 8(%2), %%mm1 \n\t"
328 "addl %4, %1 \n\t"
329 "addl $16, %2 \n\t"
330 PAVGB" (%3), %%mm0 \n\t"
331 PAVGB" 8(%3), %%mm1 \n\t"
332 "movq %%mm0, (%3) \n\t"
333 "movq %%mm1, 8(%3) \n\t"
334 "addl %5, %3 \n\t"
335 "decl %0 \n\t"
336 "1: \n\t"
337 "movq (%1), %%mm0 \n\t"
338 "movq 8(%1), %%mm1 \n\t"
339 "addl %4, %1 \n\t"
340 PAVGB" (%2), %%mm0 \n\t"
341 PAVGB" 8(%2), %%mm1 \n\t"
342 PAVGB" (%3), %%mm0 \n\t"
343 PAVGB" 8(%3), %%mm1 \n\t"
344 "movq %%mm0, (%3) \n\t"
345 "movq %%mm1, 8(%3) \n\t"
346 "addl %5, %3 \n\t"
347 "movq (%1), %%mm0 \n\t"
348 "movq 8(%1), %%mm1 \n\t"
349 "addl %4, %1 \n\t"
350 PAVGB" 16(%2), %%mm0 \n\t"
351 PAVGB" 24(%2), %%mm1 \n\t"
352 PAVGB" (%3), %%mm0 \n\t"
353 PAVGB" 8(%3), %%mm1 \n\t"
354 "movq %%mm0, (%3) \n\t"
355 "movq %%mm1, 8(%3) \n\t"
356 "addl %5, %3 \n\t"
357 "addl $32, %2 \n\t"
358 "subl $2, %0 \n\t"
359 "jnz 1b \n\t"
360 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
361 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
362 #else
363 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
364 #endif
365 :"S"(src1Stride), "D"(dstStride)
366 :"memory");
367 //the following should be used, though better not with gcc ...
368 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
369 :"r"(src1Stride), "r"(dstStride)
370 :"memory");*/
371 }
372
373 static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
374 {
375 __asm __volatile(
376 "pcmpeqb %%mm6, %%mm6\n\t"
377 "testl $1, %0 \n\t"
378 " jz 1f \n\t"
379 "movq (%1), %%mm0 \n\t"
380 "movq 8(%1), %%mm1 \n\t"
381 "movq (%2), %%mm2 \n\t"
382 "movq 8(%2), %%mm3 \n\t"
383 "pxor %%mm6, %%mm0 \n\t"
384 "pxor %%mm6, %%mm1 \n\t"
385 "pxor %%mm6, %%mm2 \n\t"
386 "pxor %%mm6, %%mm3 \n\t"
387 PAVGB" %%mm2, %%mm0 \n\t"
388 PAVGB" %%mm3, %%mm1 \n\t"
389 "pxor %%mm6, %%mm0 \n\t"
390 "pxor %%mm6, %%mm1 \n\t"
391 "addl %4, %1 \n\t"
392 "addl $16, %2 \n\t"
393 "movq %%mm0, (%3) \n\t"
394 "movq %%mm1, 8(%3) \n\t"
395 "addl %5, %3 \n\t"
396 "decl %0 \n\t"
397 "1: \n\t"
398 "movq (%1), %%mm0 \n\t"
399 "movq 8(%1), %%mm1 \n\t"
400 "addl %4, %1 \n\t"
401 "movq (%2), %%mm2 \n\t"
402 "movq 8(%2), %%mm3 \n\t"
403 "pxor %%mm6, %%mm0 \n\t"
404 "pxor %%mm6, %%mm1 \n\t"
405 "pxor %%mm6, %%mm2 \n\t"
406 "pxor %%mm6, %%mm3 \n\t"
407 PAVGB" %%mm2, %%mm0 \n\t"
408 PAVGB" %%mm3, %%mm1 \n\t"
409 "pxor %%mm6, %%mm0 \n\t"
410 "pxor %%mm6, %%mm1 \n\t"
411 "movq %%mm0, (%3) \n\t"
412 "movq %%mm1, 8(%3) \n\t"
413 "addl %5, %3 \n\t"
414 "movq (%1), %%mm0 \n\t"
415 "movq 8(%1), %%mm1 \n\t"
416 "addl %4, %1 \n\t"
417 "movq 16(%2), %%mm2 \n\t"
418 "movq 24(%2), %%mm3 \n\t"
419 "pxor %%mm6, %%mm0 \n\t"
420 "pxor %%mm6, %%mm1 \n\t"
421 "pxor %%mm6, %%mm2 \n\t"
422 "pxor %%mm6, %%mm3 \n\t"
423 PAVGB" %%mm2, %%mm0 \n\t"
424 PAVGB" %%mm3, %%mm1 \n\t"
425 "pxor %%mm6, %%mm0 \n\t"
426 "pxor %%mm6, %%mm1 \n\t"
427 "movq %%mm0, (%3) \n\t"
428 "movq %%mm1, 8(%3) \n\t"
429 "addl %5, %3 \n\t"
430 "addl $32, %2 \n\t"
431 "subl $2, %0 \n\t"
432 "jnz 1b \n\t"
433 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
434 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
435 #else
436 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
437 #endif
438 :"S"(src1Stride), "D"(dstStride)
439 :"memory");
440 //the following should be used, though better not with gcc ...
441 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
442 :"r"(src1Stride), "r"(dstStride)
443 :"memory");*/
154 } 444 }
155 445
156 /* GL: this function does incorrect rounding if overflow */ 446 /* GL: this function does incorrect rounding if overflow */
157 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 447 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
158 { 448 {