mplayer.hg: comparison of libvo/yuv2rgb_mmx.c @ 1306:7ce37211e454
yuv2rgb_mmx crashes with the ffdivx codec when playing back AVI files whose
frame width is not an exact multiple of 8.
Test case: 405.avi (356x240). Playing it on an MMX-capable x86 system with the
x11 video-out driver results in a segfault.
The MMX routines convert image data 8 pixels per inner-loop iteration, and the
inner loop did not terminate when only 1-7 pixels were left in a row, so it
produced too much RGB output.
For now, just ignore the last few pixels on each row to avoid the segfaults.
(This gives a black vertical border on the right when playing a video with
width % 8 != 0.) A possible future enhancement would be to add a second loop
that converts the last width % 8 pixels to RGB using a byte loop.
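
To illustrate the loop problem, here is a minimal C sketch of the old and new
inner-loop control flow; convert_8_pixels() is a hypothetical stand-in for the
inline MMX block (it is not a function in this file), and the 16 bpp case is
assumed:

    /* Old inner loop: with h_size = 356 it runs 45 times and converts
     * pixels 0..359, overrunning the row by 4 pixels; the row pointers
     * then drift a few bytes further on every scan line until a fault. */
    x = 0;
    do {
        convert_8_pixels(py, pu, pv, image);   /* hypothetical helper */
        py += 8; pu += 4; pv += 4; image += 16;
        x += 8;
    } while (x < h_size);

    /* New inner loop: exactly h_size >> 3 = 44 iterations, converting
     * pixels 0..351 and simply skipping the trailing h_size % 8 = 4. */
    for (x = h_size >> 3; --x >= 0; ) {
        convert_8_pixels(_py, _pu, _pv, _image);
        _py += 8; _pu += 4; _pv += 4; _image += 16;
    }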
author    jkeil
date      Thu, 12 Jul 2001 15:23:26 +0000
parents   961f53221ffc
children  ae2026ac39d4
--- libvo/yuv2rgb_mmx.c  1305:0a8237e28ce0
+++ libvo/yuv2rgb_mmx.c  1306:7ce37211e454
@@ -74,28 +74,33 @@
 	       uint8_t * pu, uint8_t * pv,
 	       int h_size, int v_size,
 	       int rgb_stride, int y_stride, int uv_stride)
 {
     int even = 1;
-    int x = 0, y = 0;
+    int x, y;
 
-    /* load data for first scan line */
-    __asm__ __volatile__ (
-	 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-	 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-
-	 "pxor %%mm4, %%mm4;" /* zero mm4 */
-	 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-
-	 //"movl $0, (%3);" /* cache preload for image */
-	 : : "r" (py), "r" (pu), "r" (pv), "r" (image));
-
-    do {
-	do {
+    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
+
+    for (y = v_size; --y >= 0; ) {
+	uint8_t *_image = image;
+	uint8_t *_py = py;
+	uint8_t *_pu = pu;
+	uint8_t *_pv = pv;
+
+	/* load data for start of next scan line */
+	__asm__ __volatile__ (
+	     "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+	     "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+	     "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+
+	     : : "r" (_py), "r" (_pu), "r" (_pv));
+
+	for (x = h_size >> 3; --x >= 0; ) {
 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
    pixels in each iteration */
-    __asm__ __volatile__ (".align 8;"
+
+	__asm__ __volatile__ (
 	/* Do the multiply part of the conversion for even and odd pixels,
 	   register usage:
 	   mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
 	   mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
 	   mm6 -> Y even, mm7 -> Y odd */
@@ -197,73 +202,61 @@
 
 	"por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
 	"movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 
 	MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */
-	: : "r" (py), "r" (pu), "r" (pv), "r" (image));
+	: : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
 
-	py += 8;
-	pu += 4;
-	pv += 4;
-	image += 16;
-	x += 8;
-	} while (x < h_size);
-
-	if (even) {
-	    pu -= h_size/2;
-	    pv -= h_size/2;
-	} else {
-	    pu += (uv_stride - h_size/2);
-	    pv += (uv_stride - h_size/2);
+	_py += 8;
+	_pu += 4;
+	_pv += 4;
+	_image += 16;
 	}
 
-	py += (y_stride - h_size);
-	image += (rgb_stride - 2*h_size);
-
-	/* load data for start of next scan line */
-	__asm__ __volatile__ (
-	     "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 00 u3 u2 u1 u0 */
-	     "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 00 v2 v1 v0 */
-
-	     //"movl $0, (%3);" /* cache preload for image */
-	     "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-
-	     : : "r" (py), "r" (pu), "r" (pv), "r" (image));
-
-	x = 0;
-	y += 1;
+	if (!even) {
+	    pu += uv_stride;
+	    pv += uv_stride;
+	}
+
+	py += y_stride;
+	image += rgb_stride;
+
 	even = (!even);
-    } while (y < v_size) ;
+    }
 
     __asm__ __volatile__ (EMMS);
 }
 
 static void yuv420_argb32_mmx (uint8_t * image, uint8_t * py,
 	       uint8_t * pu, uint8_t * pv,
 	       int h_size, int v_size,
 	       int rgb_stride, int y_stride, int uv_stride)
 {
     int even = 1;
-    int x = 0, y = 0;
+    int x, y;
 
-    __asm__ __volatile__ (
-    ".align 8;"
+    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
+
+    for (y = v_size; --y >= 0; ) {
+	uint8_t *_image = image;
+	uint8_t *_py = py;
+	uint8_t *_pu = pu;
+	uint8_t *_pv = pv;
+
+	/* load data for start of next scan line */
+	__asm__ __volatile__
+	(
 	 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-	 //"movl $0, (%3);" /* cache preload for image */
-
 	 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-	 "pxor %%mm4, %%mm4;" /* zero mm4 */
-
-	 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-	 : : "r" (py), "r" (pu), "r" (pv), "r" (image));
-
-    do {
-	do {
+	 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+	 : : "r" (_py), "r" (_pu), "r" (_pv)
+	 );
+
+	for (x = h_size >> 3; --x >= 0; ) {
 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
    pixels in each iteration */
 	__asm__ __volatile__ (
-	".align 8;"
 	/* Do the multiply part of the conversion for even and odd pixels,
 	   register usage:
 	   mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
 	   mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
 	   mm6 -> Y even, mm7 -> Y odd */
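
The heart of the fix, visible in the hunk above, is the new per-row
bookkeeping: the cursors are rebased from the row-start pointers at the top of
every scan line, so the inner loop can no longer desynchronize them, and
because 4:2:0 chroma rows are shared by two luma rows, pu/pv advance only on
every second line. In plain C the row loop amounts to this (same variable
names as the patch; the MMX body is elided):

    for (y = v_size; --y >= 0; ) {
        uint8_t *_image = image;
        uint8_t *_py = py, *_pu = pu, *_pv = pv;

        /* ... convert h_size >> 3 groups of 8 pixels, advancing only
         * the per-row cursors _py, _pu, _pv, _image ... */

        py    += y_stride;      /* advance luma every row            */
        image += rgb_stride;
        if (!even) {            /* advance chroma every second row:  */
            pu += uv_stride;    /* 4:2:0 chroma is shared between    */
            pv += uv_stride;    /* two consecutive luma rows         */
        }
        even = !even;
    }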
377 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | 370 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
378 | 371 |
379 "pxor %%mm4, %%mm4;" /* zero mm4 */ | 372 "pxor %%mm4, %%mm4;" /* zero mm4 */ |
380 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | 373 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
381 | 374 |
382 : : "r" (py), "r" (pu), "r" (pv), "r" (image)); | 375 : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image)); |
383 | 376 |
384 py += 8; | 377 _py += 8; |
385 pu += 4; | 378 _pu += 4; |
386 pv += 4; | 379 _pv += 4; |
387 image += 32; | 380 _image += 32; |
388 x += 8; | |
389 } while (x < h_size); | |
390 | |
391 if (even) { | |
392 pu -= h_size/2; | |
393 pv -= h_size/2; | |
394 } else { | |
395 pu += (uv_stride - h_size/2); | |
396 pv += (uv_stride - h_size/2); | |
397 } | 381 } |
398 | 382 |
399 py += (y_stride - h_size); | 383 if (!even) { |
400 image += (rgb_stride - 4*h_size); | 384 pu += uv_stride; |
401 | 385 pv += uv_stride; |
402 /* load data for start of next scan line */ | 386 } |
403 __asm__ __volatile__ | 387 |
404 ( | 388 py += y_stride; |
405 ".align 8;" | 389 image += rgb_stride; |
406 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | 390 |
407 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | |
408 | |
409 //"movl $0, (%3);" /* cache preload for image */ | |
410 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | |
411 : : "r" (py), "r" (pu), "r" (pv), "r" (image) | |
412 ); | |
413 | |
414 | |
415 x = 0; | |
416 y += 1; | |
417 even = (!even); | 391 even = (!even); |
418 } while ( y < v_size) ; | 392 } |
419 | 393 |
420 __asm__ __volatile__ (EMMS); | 394 __asm__ __volatile__ (EMMS); |
421 } | 395 } |
422 | 396 |
423 yuv2rgb_fun yuv2rgb_init_mmx (int bpp, int mode) | 397 yuv2rgb_fun yuv2rgb_init_mmx (int bpp, int mode) |
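
The byte-loop enhancement suggested in the commit message could look roughly
like the following tail after the MMX loop, for the 16 bpp case (a sketch
under that assumption; yuv_to_rgb16() is a made-up scalar converter, not part
of yuv2rgb_mmx.c):

    /* After the MMX loop, _py/_pu/_pv/_image already point at the
     * first unconverted pixel of the row. */
    int rem = h_size & 7;                 /* 1..7 leftover pixels */
    for (x = 0; x < rem; x++) {
        /* chroma is subsampled 2:1 horizontally, hence x >> 1 */
        ((uint16_t *) _image)[x] =
            yuv_to_rgb16(_py[x], _pu[x >> 1], _pv[x >> 1]);
    }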