# HG changeset patch
# User jkeil
# Date 994951406 0
# Node ID 7ce37211e454f40a99fb1d513c2b6e84074242cf
# Parent  0a8237e28ce095f569f2527bf967811bdf5eacec
yuv2rgb_mmx crashes with the ffdivx codec when we play back avi files that
have a frame width that is not an exact multiple of 8.

Testcase: 405.avi (356x240).  Playing on an MMX-capable x86 system using
the x11 video-out driver results in a segfault.

The MMX routines convert image data in quantities of 8 pixels per loop
iteration, and the inner loop was not terminated when only 1-7 pixels were
left, producing too much RGB output.

For now, just ignore the last few pixels on each row, to avoid the
segfaults.  (Gives a black vertical border on the right if you play a
video with width%8 != 0.)

A possible future enhancement would be to add a second loop that converts
the last width%8 pixels to RGB using a byte loop (a sketch of such a loop
follows the patch below).

diff -r 0a8237e28ce0 -r 7ce37211e454 libvo/yuv2rgb_mmx.c
--- a/libvo/yuv2rgb_mmx.c	Thu Jul 12 15:10:06 2001 +0000
+++ b/libvo/yuv2rgb_mmx.c	Thu Jul 12 15:23:26 2001 +0000
@@ -76,24 +76,29 @@
                               int rgb_stride, int y_stride, int uv_stride)
 {
     int even = 1;
-    int x = 0, y = 0;
+    int x, y;
+
+    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
 
-    /* load data for first scan line */
-    __asm__ __volatile__ (
-           "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-           "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+    for (y = v_size; --y >= 0; ) {
+        uint8_t *_image = image;
+        uint8_t *_py = py;
+        uint8_t *_pu = pu;
+        uint8_t *_pv = pv;
 
-           "pxor %%mm4, %%mm4;" /* zero mm4 */
-           "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+        /* load data for start of next scan line */
+        __asm__ __volatile__ (
+                 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+                 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+                 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 
-           //"movl $0, (%3);" /* cache preload for image */
-           : : "r" (py), "r" (pu), "r" (pv), "r" (image));
+                 : : "r" (_py), "r" (_pu), "r" (_pv));
 
-    do {
-        do {
+        for (x = h_size >> 3; --x >= 0; ) {
             /* this mmx assembly code deals with SINGLE scan line at a time,
              * it convert 8 pixels in each iteration */
-            __asm__ __volatile__ (".align 8;"
+
+            __asm__ __volatile__ (
                 /* Do the multiply part of the conversion for even and odd pixels,
                  * register usage:
                  * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
@@ -199,40 +204,24 @@
                      "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
                      MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */
 
-                     : : "r" (py), "r" (pu), "r" (pv), "r" (image));
+                     : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
 
-            py += 8;
-            pu += 4;
-            pv += 4;
-            image += 16;
-            x += 8;
-        } while (x < h_size);
-
-        if (even) {
-            pu -= h_size/2;
-            pv -= h_size/2;
-        } else {
-            pu += (uv_stride - h_size/2);
-            pv += (uv_stride - h_size/2);
+            _py += 8;
+            _pu += 4;
+            _pv += 4;
+            _image += 16;
         }
 
-        py += (y_stride - h_size);
-        image += (rgb_stride - 2*h_size);
-
-        /* load data for start of next scan line */
-        __asm__ __volatile__ (
-               "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 00 u3 u2 u1 u0 */
-               "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 00 v2 v1 v0 */
+        if (!even) {
+            pu += uv_stride;
+            pv += uv_stride;
+        }
 
-               //"movl $0, (%3);" /* cache preload for image */
-               "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-
-               : : "r" (py), "r" (pu), "r" (pv), "r" (image));
+        py += y_stride;
+        image += rgb_stride;
 
-        x = 0;
-        y += 1;
         even = (!even);
-    } while (y < v_size) ;
+    }
 
     __asm__ __volatile__ (EMMS);
 }
@@ -243,25 +232,29 @@
                               int rgb_stride, int y_stride,
                               int uv_stride)
 {
     int even = 1;
-    int x = 0, y = 0;
+    int x, y;
+
+    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
 
-    __asm__ __volatile__ (
-           ".align 8;"
+    for (y = v_size; --y >= 0; ) {
+        uint8_t *_image = image;
+        uint8_t *_py = py;
+        uint8_t *_pu = pu;
+        uint8_t *_pv = pv;
+
+        /* load data for start of next scan line */
+        __asm__ __volatile__
+            (
             "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-           //"movl $0, (%3);" /* cache preload for image */
-           "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-           "pxor %%mm4, %%mm4;" /* zero mm4 */
+            "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+            : : "r" (_py), "r" (_pu), "r" (_pv)
+            );
 
-           "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-           : : "r" (py), "r" (pu), "r" (pv), "r" (image));
-
-    do {
-        do {
+        for (x = h_size >> 3; --x >= 0; ) {
             /* this mmx assembly code deals with SINGLE scan line at a time,
              * it convert 8 pixels in each iteration */
             __asm__ __volatile__ (
-                ".align 8;"
                 /* Do the multiply part of the conversion for even and odd pixels,
                  * register usage:
                  * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
@@ -379,43 +372,24 @@
                      "pxor %%mm4, %%mm4;" /* zero mm4 */
                      "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 
-                     : : "r" (py), "r" (pu), "r" (pv), "r" (image));
+                     : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
 
-            py += 8;
-            pu += 4;
-            pv += 4;
-            image += 32;
-            x += 8;
-        } while (x < h_size);
-
-        if (even) {
-            pu -= h_size/2;
-            pv -= h_size/2;
-        } else {
-            pu += (uv_stride - h_size/2);
-            pv += (uv_stride - h_size/2);
+            _py += 8;
+            _pu += 4;
+            _pv += 4;
+            _image += 32;
         }
 
-        py += (y_stride - h_size);
-        image += (rgb_stride - 4*h_size);
-
-        /* load data for start of next scan line */
-        __asm__ __volatile__
-            (
-             ".align 8;"
-             "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-             "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+        if (!even) {
+            pu += uv_stride;
+            pv += uv_stride;
+        }
 
-             //"movl $0, (%3);" /* cache preload for image */
-             "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-             : : "r" (py), "r" (pu), "r" (pv), "r" (image)
-             );
+        py += y_stride;
+        image += rgb_stride;
 
-
-        x = 0;
-        y += 1;
         even = (!even);
-    } while ( y < v_size) ;
+    }
 
     __asm__ __volatile__ (EMMS);
 }
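
For reference, a minimal sketch of the "byte loop" enhancement suggested in
the description, written for the 16-bpp path (the first hunk, where the MMX
loop emits 16 bytes per 8 pixels).  It is not part of the changeset: the
function name yuv_tail_to_rgb565, its parameters, the BT.601-style integer
coefficients and the 5:6:5 packing order are illustrative assumptions only.

#include <stdint.h>

/* Hypothetical scalar tail loop (illustration only, not in the patch):
 * convert the last "tail" = h_size % 8 pixels of one scan line from
 * planar YUV 4:2:0 to RGB565.  row, py, pu and pv are assumed to point
 * just past the data consumed by the MMX loop, i.e. at the first
 * unconverted pixel of the row. */
static void yuv_tail_to_rgb565(uint8_t *row, const uint8_t *py,
                               const uint8_t *pu, const uint8_t *pv,
                               int tail)
{
    int i;

    for (i = 0; i < tail; i++) {
        int y = py[i] - 16;
        int u = pu[i / 2] - 128;   /* chroma is subsampled 2:1 horizontally */
        int v = pv[i / 2] - 128;
        /* integer approximation of the BT.601 YCbCr -> RGB matrix */
        int r = (298 * y + 409 * v + 128) >> 8;
        int g = (298 * y - 100 * u - 208 * v + 128) >> 8;
        int b = (298 * y + 516 * u + 128) >> 8;

        if (r < 0) r = 0; else if (r > 255) r = 255;
        if (g < 0) g = 0; else if (g > 255) g = 255;
        if (b < 0) b = 0; else if (b > 255) b = 255;

        /* pack into 5:6:5 and store */
        ((uint16_t *) row)[i] =
            (uint16_t) (((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
    }
}

When the inner for (x = h_size >> 3; ...) loop of the patched 16-bpp routine
exits, _image, _py, _pu and _pv already point just past the last full group
of 8 pixels, so a call such as yuv_tail_to_rgb565(_image, _py, _pu, _pv,
h_size & 7) at that point would fill in the pixels that this patch currently
leaves black.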