Mercurial > libavcodec.hg
comparison i386/dsputil_mmx_avg.h @ 651:45e8f39fda50 libavcodec
put/avg_pixels16
fixing 2 small qpel bugs
author | michaelni |
---|---|
date | Wed, 11 Sep 2002 12:39:53 +0000 |
parents | e8c8ca9106aa |
children | 13aec7e50c52 |
comparison
equal
deleted
inserted
replaced
650:ef4a33aad86e | 651:45e8f39fda50 |
---|---|
23 */ | 23 */ |
24 | 24 |
25 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm | 25 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm |
26 clobber bug - now it will work with 2.95.2 and also with -fPIC | 26 clobber bug - now it will work with 2.95.2 and also with -fPIC |
27 */ | 27 */ |
28 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 28 static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
29 { | 29 { |
30 __asm __volatile( | 30 __asm __volatile( |
31 "lea (%3, %3), %%eax \n\t" | 31 "lea (%3, %3), %%eax \n\t" |
32 "1: \n\t" | 32 "1: \n\t" |
33 "movq (%1), %%mm0 \n\t" | 33 "movq (%1), %%mm0 \n\t" |
34 "movq (%1, %3), %%mm1 \n\t" | 34 "movq (%1, %3), %%mm1 \n\t" |
35 PAVGB" 1(%1), %%mm0 \n\t" | 35 PAVGB" 1(%1), %%mm0 \n\t" |
36 PAVGB" 1(%1, %3), %%mm1 \n\t" | 36 PAVGB" 1(%1, %3), %%mm1 \n\t" |
37 "movq %%mm0, (%2) \n\t" | 37 "movq %%mm0, (%2) \n\t" |
38 "movq %%mm1, (%2, %3) \n\t" | 38 "movq %%mm1, (%2, %3) \n\t" |
39 "addl %%eax, %1 \n\t" | 39 "addl %%eax, %1 \n\t" |
40 "addl %%eax, %2 \n\t" | 40 "addl %%eax, %2 \n\t" |
41 "movq (%1), %%mm0 \n\t" | 41 "movq (%1), %%mm0 \n\t" |
42 "movq (%1, %3), %%mm1 \n\t" | 42 "movq (%1, %3), %%mm1 \n\t" |
43 PAVGB" 1(%1), %%mm0 \n\t" | 43 PAVGB" 1(%1), %%mm0 \n\t" |
44 PAVGB" 1(%1, %3), %%mm1 \n\t" | 44 PAVGB" 1(%1, %3), %%mm1 \n\t" |
45 "addl %%eax, %1 \n\t" | 45 "addl %%eax, %1 \n\t" |
46 "movq %%mm0, (%2) \n\t" | 46 "movq %%mm0, (%2) \n\t" |
47 "movq %%mm1, (%2, %3) \n\t" | 47 "movq %%mm1, (%2, %3) \n\t" |
48 "addl %%eax, %2 \n\t" | |
49 "subl $4, %0 \n\t" | |
50 "jnz 1b \n\t" | |
51 :"+g"(h), "+S"(pixels), "+D"(block) | |
52 :"r" (line_size) | |
53 :"%eax", "memory"); | |
54 } | |
55 | |
56 static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
57 { | |
58 __asm __volatile( | |
59 "lea (%3, %3), %%eax \n\t" | |
60 "1: \n\t" | |
61 "movq (%1), %%mm0 \n\t" | |
62 "movq (%1, %3), %%mm1 \n\t" | |
63 "movq 8(%1), %%mm2 \n\t" | |
64 "movq 8(%1, %3), %%mm3 \n\t" | |
65 PAVGB" 1(%1), %%mm0 \n\t" | |
66 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
67 PAVGB" 9(%1), %%mm2 \n\t" | |
68 PAVGB" 9(%1, %3), %%mm3 \n\t" | |
69 "movq %%mm0, (%2) \n\t" | |
70 "movq %%mm1, (%2, %3) \n\t" | |
71 "movq %%mm2, 8(%2) \n\t" | |
72 "movq %%mm3, 8(%2, %3) \n\t" | |
73 "addl %%eax, %1 \n\t" | |
74 "addl %%eax, %2 \n\t" | |
75 "movq (%1), %%mm0 \n\t" | |
76 "movq (%1, %3), %%mm1 \n\t" | |
77 "movq 8(%1), %%mm2 \n\t" | |
78 "movq 8(%1, %3), %%mm3 \n\t" | |
79 PAVGB" 1(%1), %%mm0 \n\t" | |
80 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
81 PAVGB" 9(%1), %%mm2 \n\t" | |
82 PAVGB" 9(%1, %3), %%mm3 \n\t" | |
83 "addl %%eax, %1 \n\t" | |
84 "movq %%mm0, (%2) \n\t" | |
85 "movq %%mm1, (%2, %3) \n\t" | |
86 "movq %%mm2, 8(%2) \n\t" | |
87 "movq %%mm3, 8(%2, %3) \n\t" | |
48 "addl %%eax, %2 \n\t" | 88 "addl %%eax, %2 \n\t" |
49 "subl $4, %0 \n\t" | 89 "subl $4, %0 \n\t" |
50 "jnz 1b \n\t" | 90 "jnz 1b \n\t" |
51 :"+g"(h), "+S"(pixels), "+D"(block) | 91 :"+g"(h), "+S"(pixels), "+D"(block) |
52 :"r" (line_size) | 92 :"r" (line_size) |
53 :"%eax", "memory"); | 93 :"%eax", "memory"); |
54 } | 94 } |
55 | 95 |
56 /* GL: this function rounds incorrectly if overflow occurs */ | 96 /* GL: this function rounds incorrectly if overflow occurs */ |
57 static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 97 static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
58 { | 98 { |
59 MOVQ_BONE(mm6); | 99 MOVQ_BONE(mm6); |
60 __asm __volatile( | 100 __asm __volatile( |
61 "lea (%3, %3), %%eax \n\t" | 101 "lea (%3, %3), %%eax \n\t" |
62 "1: \n\t" | 102 "1: \n\t" |
89 :"+g"(h), "+S"(pixels), "+D"(block) | 129 :"+g"(h), "+S"(pixels), "+D"(block) |
90 :"r" (line_size) | 130 :"r" (line_size) |
91 :"%eax", "memory"); | 131 :"%eax", "memory"); |
92 } | 132 } |
93 | 133 |
94 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 134 static void DEF(put_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
95 { | 135 { |
96 __asm __volatile( | 136 __asm __volatile( |
97 "lea (%3, %3), %%eax \n\t" | 137 "lea (%3, %3), %%eax \n\t" |
98 "movq (%1), %%mm0 \n\t" | 138 "movq (%1), %%mm0 \n\t" |
99 "subl %3, %2 \n\t" | 139 "subl %3, %2 \n\t" |
120 :"r" (line_size) | 160 :"r" (line_size) |
121 :"%eax", "memory"); | 161 :"%eax", "memory"); |
122 } | 162 } |
123 | 163 |
124 /* GL: this function rounds incorrectly if overflow occurs */ | 164 /* GL: this function rounds incorrectly if overflow occurs */ |
125 static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 165 static void DEF(put_no_rnd_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
126 { | 166 { |
127 MOVQ_BONE(mm6); | 167 MOVQ_BONE(mm6); |
128 __asm __volatile( | 168 __asm __volatile( |
129 "lea (%3, %3), %%eax \n\t" | 169 "lea (%3, %3), %%eax \n\t" |
130 "movq (%1), %%mm0 \n\t" | 170 "movq (%1), %%mm0 \n\t" |
153 :"+g"(h), "+S"(pixels), "+D" (block) | 193 :"+g"(h), "+S"(pixels), "+D" (block) |
154 :"r" (line_size) | 194 :"r" (line_size) |
155 :"%eax", "memory"); | 195 :"%eax", "memory"); |
156 } | 196 } |
157 | 197 |
158 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 198 static void DEF(avg_pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
159 { | 199 { |
160 __asm __volatile( | 200 __asm __volatile( |
161 "lea (%3, %3), %%eax \n\t" | 201 "lea (%3, %3), %%eax \n\t" |
162 "1: \n\t" | 202 "1: \n\t" |
163 "movq (%2), %%mm0 \n\t" | 203 "movq (%2), %%mm0 \n\t" |
181 :"+g"(h), "+S"(pixels), "+D"(block) | 221 :"+g"(h), "+S"(pixels), "+D"(block) |
182 :"r" (line_size) | 222 :"r" (line_size) |
183 :"%eax", "memory"); | 223 :"%eax", "memory"); |
184 } | 224 } |
185 | 225 |
186 static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 226 static void DEF(avg_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
187 { | 227 { |
188 __asm __volatile( | 228 __asm __volatile( |
189 "lea (%3, %3), %%eax \n\t" | 229 "lea (%3, %3), %%eax \n\t" |
190 "1: \n\t" | 230 "1: \n\t" |
191 "movq (%1), %%mm0 \n\t" | 231 "movq (%1), %%mm0 \n\t" |
213 :"+g"(h), "+S"(pixels), "+D"(block) | 253 :"+g"(h), "+S"(pixels), "+D"(block) |
214 :"r" (line_size) | 254 :"r" (line_size) |
215 :"%eax", "memory"); | 255 :"%eax", "memory"); |
216 } | 256 } |
217 | 257 |
218 static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 258 static void DEF(avg_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
219 { | 259 { |
220 __asm __volatile( | 260 __asm __volatile( |
221 "lea (%3, %3), %%eax \n\t" | 261 "lea (%3, %3), %%eax \n\t" |
222 "movq (%1), %%mm0 \n\t" | 262 "movq (%1), %%mm0 \n\t" |
223 "subl %3, %2 \n\t" | 263 "subl %3, %2 \n\t" |
252 :"r" (line_size) | 292 :"r" (line_size) |
253 :"%eax", "memory"); | 293 :"%eax", "memory"); |
254 } | 294 } |
255 | 295 |
256 // Note: this is not correctly rounded, but this function is only used for B-frames, so it doesn't matter | 296 // Note: this is not correctly rounded, but this function is only used for B-frames, so it doesn't matter |
257 static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 297 static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
258 { | 298 { |
259 MOVQ_BONE(mm6); | 299 MOVQ_BONE(mm6); |
260 __asm __volatile( | 300 __asm __volatile( |
261 "lea (%3, %3), %%eax \n\t" | 301 "lea (%3, %3), %%eax \n\t" |
262 "movq (%1), %%mm0 \n\t" | 302 "movq (%1), %%mm0 \n\t" |
292 "jnz 1b \n\t" | 332 "jnz 1b \n\t" |
293 :"+g"(h), "+S"(pixels), "+D"(block) | 333 :"+g"(h), "+S"(pixels), "+D"(block) |
294 :"r" (line_size) | 334 :"r" (line_size) |
295 :"%eax", "memory"); | 335 :"%eax", "memory"); |
296 } | 336 } |
337 | |
338 //FIXME the following could be optimized too ... | |
339 static void DEF(put_no_rnd_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | |
340 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); | |
341 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); | |
342 } | |
343 static void DEF(put_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | |
344 DEF(put_pixels8_y2)(block , pixels , line_size, h); | |
345 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); | |
346 } | |
347 static void DEF(put_no_rnd_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | |
348 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); | |
349 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); | |
350 } | |
351 static void DEF(avg_pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | |
352 DEF(avg_pixels8)(block , pixels , line_size, h); | |
353 DEF(avg_pixels8)(block+8, pixels+8, line_size, h); | |
354 } | |
355 static void DEF(avg_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | |
356 DEF(avg_pixels8_x2)(block , pixels , line_size, h); | |
357 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); | |
358 } | |
359 static void DEF(avg_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | |
360 DEF(avg_pixels8_y2)(block , pixels , line_size, h); | |
361 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); | |
362 } | |
363 static void DEF(avg_pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | |
364 DEF(avg_pixels8_xy2)(block , pixels , line_size, h); | |
365 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); | |
366 } | |
367 |