comparison i386/dsputil_mmx_avg.h @ 651:45e8f39fda50 libavcodec

put/avg_pixels16 fixing 2 small qpel bugs
author michaelni
date Wed, 11 Sep 2002 12:39:53 +0000
parents e8c8ca9106aa
children 13aec7e50c52
comparison
equal deleted inserted replaced
650:ef4a33aad86e 651:45e8f39fda50
23 */ 23 */
24 24
25 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm 25 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
26 clobber bug - now it will work with 2.95.2 and also with -fPIC 26 clobber bug - now it will work with 2.95.2 and also with -fPIC
27 */ 27 */
28 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) 28 static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
29 { 29 {
30 __asm __volatile( 30 __asm __volatile(
31 "lea (%3, %3), %%eax \n\t" 31 "lea (%3, %3), %%eax \n\t"
32 "1: \n\t" 32 "1: \n\t"
33 "movq (%1), %%mm0 \n\t" 33 "movq (%1), %%mm0 \n\t"
34 "movq (%1, %3), %%mm1 \n\t" 34 "movq (%1, %3), %%mm1 \n\t"
35 PAVGB" 1(%1), %%mm0 \n\t" 35 PAVGB" 1(%1), %%mm0 \n\t"
36 PAVGB" 1(%1, %3), %%mm1 \n\t" 36 PAVGB" 1(%1, %3), %%mm1 \n\t"
37 "movq %%mm0, (%2) \n\t" 37 "movq %%mm0, (%2) \n\t"
38 "movq %%mm1, (%2, %3) \n\t" 38 "movq %%mm1, (%2, %3) \n\t"
39 "addl %%eax, %1 \n\t" 39 "addl %%eax, %1 \n\t"
40 "addl %%eax, %2 \n\t" 40 "addl %%eax, %2 \n\t"
41 "movq (%1), %%mm0 \n\t" 41 "movq (%1), %%mm0 \n\t"
42 "movq (%1, %3), %%mm1 \n\t" 42 "movq (%1, %3), %%mm1 \n\t"
43 PAVGB" 1(%1), %%mm0 \n\t" 43 PAVGB" 1(%1), %%mm0 \n\t"
44 PAVGB" 1(%1, %3), %%mm1 \n\t" 44 PAVGB" 1(%1, %3), %%mm1 \n\t"
45 "addl %%eax, %1 \n\t" 45 "addl %%eax, %1 \n\t"
46 "movq %%mm0, (%2) \n\t" 46 "movq %%mm0, (%2) \n\t"
47 "movq %%mm1, (%2, %3) \n\t" 47 "movq %%mm1, (%2, %3) \n\t"
48 "addl %%eax, %2 \n\t"
49 "subl $4, %0 \n\t"
50 "jnz 1b \n\t"
51 :"+g"(h), "+S"(pixels), "+D"(block)
52 :"r" (line_size)
53 :"%eax", "memory");
54 }
55
56 static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
57 {
58 __asm __volatile(
59 "lea (%3, %3), %%eax \n\t"
60 "1: \n\t"
61 "movq (%1), %%mm0 \n\t"
62 "movq (%1, %3), %%mm1 \n\t"
63 "movq 8(%1), %%mm2 \n\t"
64 "movq 8(%1, %3), %%mm3 \n\t"
65 PAVGB" 1(%1), %%mm0 \n\t"
66 PAVGB" 1(%1, %3), %%mm1 \n\t"
67 PAVGB" 9(%1), %%mm2 \n\t"
68 PAVGB" 9(%1, %3), %%mm3 \n\t"
69 "movq %%mm0, (%2) \n\t"
70 "movq %%mm1, (%2, %3) \n\t"
71 "movq %%mm2, 8(%2) \n\t"
72 "movq %%mm3, 8(%2, %3) \n\t"
73 "addl %%eax, %1 \n\t"
74 "addl %%eax, %2 \n\t"
75 "movq (%1), %%mm0 \n\t"
76 "movq (%1, %3), %%mm1 \n\t"
77 "movq 8(%1), %%mm2 \n\t"
78 "movq 8(%1, %3), %%mm3 \n\t"
79 PAVGB" 1(%1), %%mm0 \n\t"
80 PAVGB" 1(%1, %3), %%mm1 \n\t"
81 PAVGB" 9(%1), %%mm2 \n\t"
82 PAVGB" 9(%1, %3), %%mm3 \n\t"
83 "addl %%eax, %1 \n\t"
84 "movq %%mm0, (%2) \n\t"
85 "movq %%mm1, (%2, %3) \n\t"
86 "movq %%mm2, 8(%2) \n\t"
87 "movq %%mm3, 8(%2, %3) \n\t"
48 "addl %%eax, %2 \n\t" 88 "addl %%eax, %2 \n\t"
49 "subl $4, %0 \n\t" 89 "subl $4, %0 \n\t"
50 "jnz 1b \n\t" 90 "jnz 1b \n\t"
51 :"+g"(h), "+S"(pixels), "+D"(block) 91 :"+g"(h), "+S"(pixels), "+D"(block)
52 :"r" (line_size) 92 :"r" (line_size)
53 :"%eax", "memory"); 93 :"%eax", "memory");
54 } 94 }
55 95
56 /* GL: this function rounds incorrectly on overflow */ 96 /* GL: this function rounds incorrectly on overflow */
57 static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) 97 static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
58 { 98 {
59 MOVQ_BONE(mm6); 99 MOVQ_BONE(mm6);
60 __asm __volatile( 100 __asm __volatile(
61 "lea (%3, %3), %%eax \n\t" 101 "lea (%3, %3), %%eax \n\t"
62 "1: \n\t" 102 "1: \n\t"
89 :"+g"(h), "+S"(pixels), "+D"(block) 129 :"+g"(h), "+S"(pixels), "+D"(block)
90 :"r" (line_size) 130 :"r" (line_size)
91 :"%eax", "memory"); 131 :"%eax", "memory");
92 } 132 }
93 133
94 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) 134 static void DEF(put_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
95 { 135 {
96 __asm __volatile( 136 __asm __volatile(
97 "lea (%3, %3), %%eax \n\t" 137 "lea (%3, %3), %%eax \n\t"
98 "movq (%1), %%mm0 \n\t" 138 "movq (%1), %%mm0 \n\t"
99 "subl %3, %2 \n\t" 139 "subl %3, %2 \n\t"
120 :"r" (line_size) 160 :"r" (line_size)
121 :"%eax", "memory"); 161 :"%eax", "memory");
122 } 162 }
123 163
124 /* GL: this function rounds incorrectly on overflow */ 164 /* GL: this function rounds incorrectly on overflow */
125 static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) 165 static void DEF(put_no_rnd_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
126 { 166 {
127 MOVQ_BONE(mm6); 167 MOVQ_BONE(mm6);
128 __asm __volatile( 168 __asm __volatile(
129 "lea (%3, %3), %%eax \n\t" 169 "lea (%3, %3), %%eax \n\t"
130 "movq (%1), %%mm0 \n\t" 170 "movq (%1), %%mm0 \n\t"
153 :"+g"(h), "+S"(pixels), "+D" (block) 193 :"+g"(h), "+S"(pixels), "+D" (block)
154 :"r" (line_size) 194 :"r" (line_size)
155 :"%eax", "memory"); 195 :"%eax", "memory");
156 } 196 }
157 197
158 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) 198 static void DEF(avg_pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
159 { 199 {
160 __asm __volatile( 200 __asm __volatile(
161 "lea (%3, %3), %%eax \n\t" 201 "lea (%3, %3), %%eax \n\t"
162 "1: \n\t" 202 "1: \n\t"
163 "movq (%2), %%mm0 \n\t" 203 "movq (%2), %%mm0 \n\t"
181 :"+g"(h), "+S"(pixels), "+D"(block) 221 :"+g"(h), "+S"(pixels), "+D"(block)
182 :"r" (line_size) 222 :"r" (line_size)
183 :"%eax", "memory"); 223 :"%eax", "memory");
184 } 224 }
185 225
186 static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) 226 static void DEF(avg_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
187 { 227 {
188 __asm __volatile( 228 __asm __volatile(
189 "lea (%3, %3), %%eax \n\t" 229 "lea (%3, %3), %%eax \n\t"
190 "1: \n\t" 230 "1: \n\t"
191 "movq (%1), %%mm0 \n\t" 231 "movq (%1), %%mm0 \n\t"
213 :"+g"(h), "+S"(pixels), "+D"(block) 253 :"+g"(h), "+S"(pixels), "+D"(block)
214 :"r" (line_size) 254 :"r" (line_size)
215 :"%eax", "memory"); 255 :"%eax", "memory");
216 } 256 }
217 257
218 static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) 258 static void DEF(avg_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
219 { 259 {
220 __asm __volatile( 260 __asm __volatile(
221 "lea (%3, %3), %%eax \n\t" 261 "lea (%3, %3), %%eax \n\t"
222 "movq (%1), %%mm0 \n\t" 262 "movq (%1), %%mm0 \n\t"
223 "subl %3, %2 \n\t" 263 "subl %3, %2 \n\t"
252 :"r" (line_size) 292 :"r" (line_size)
253 :"%eax", "memory"); 293 :"%eax", "memory");
254 } 294 }
255 295
256 // Note this is not correctly rounded, but this function is only used for b frames so it doesn't matter 296 // Note this is not correctly rounded, but this function is only used for b frames so it doesn't matter
257 static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) 297 static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
258 { 298 {
259 MOVQ_BONE(mm6); 299 MOVQ_BONE(mm6);
260 __asm __volatile( 300 __asm __volatile(
261 "lea (%3, %3), %%eax \n\t" 301 "lea (%3, %3), %%eax \n\t"
262 "movq (%1), %%mm0 \n\t" 302 "movq (%1), %%mm0 \n\t"
292 "jnz 1b \n\t" 332 "jnz 1b \n\t"
293 :"+g"(h), "+S"(pixels), "+D"(block) 333 :"+g"(h), "+S"(pixels), "+D"(block)
294 :"r" (line_size) 334 :"r" (line_size)
295 :"%eax", "memory"); 335 :"%eax", "memory");
296 } 336 }
337
338 //FIXME the following could be optimized too ...
339 static void DEF(put_no_rnd_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
340 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
341 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
342 }
343 static void DEF(put_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
344 DEF(put_pixels8_y2)(block , pixels , line_size, h);
345 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
346 }
347 static void DEF(put_no_rnd_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
348 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
349 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
350 }
351 static void DEF(avg_pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
352 DEF(avg_pixels8)(block , pixels , line_size, h);
353 DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
354 }
355 static void DEF(avg_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
356 DEF(avg_pixels8_x2)(block , pixels , line_size, h);
357 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
358 }
359 static void DEF(avg_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
360 DEF(avg_pixels8_y2)(block , pixels , line_size, h);
361 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
362 }
363 static void DEF(avg_pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
364 DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
365 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
366 }
367