comparison libvo/yuv2rgb_mmx.c @ 1306:7ce37211e454

yuv2rgb_mmx crashes with the ffdivx codec when playing back AVI files whose frame width is not an exact multiple of 8. Testcase: 405.avi (356x240). Playing it on an MMX-capable x86 system with the x11 video-out driver results in a segfault. The MMX routines convert image data in quantities of 8 pixels per loop iteration, and the inner loop was not terminated when only 1-7 pixels of a row remained, so too much RGB output was produced. For now, simply ignore the last few pixels of each row to avoid the segfaults. (This leaves a black vertical border on the right when playing a video with width%8 != 0.) A possible future enhancement would be to add a second loop that converts the last width%8 pixels to RGB using a byte loop.
author jkeil
date Thu, 12 Jul 2001 15:23:26 +0000
parents 961f53221ffc
children ae2026ac39d4
comparing 1305:0a8237e28ce0 with 1306:7ce37211e454
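
To make the fix concrete, here is a minimal scalar C sketch of the new control flow introduced by this changeset: a bounded chunk loop of h_size >> 3 iterations per row (8 pixels each), per-row working copies of the pointers, and chroma pointers that advance only after every second row (4:2:0). It is only an illustration, not the MMX code itself: yuv2rgb_pixel() is a hypothetical stand-in using approximate textbook BT.601-style coefficients, and "2 bytes per pixel" assumes the 16 bpp (RGB565) path of the first function below.

#include <stdint.h>

/* Hypothetical scalar stand-in for one pixel of the conversion; the real
   code uses MMX constant tables, these are approximate BT.601 factors
   in 16.16 fixed point. */
static void yuv2rgb_pixel (uint8_t *dst, int y, int u, int v)
{
    int r = y + ((91881 * (v - 128)) >> 16);                      /* 1.402 */
    int g = y - ((22554 * (u - 128) + 46802 * (v - 128)) >> 16);  /* 0.344/0.714 */
    int b = y + ((116130 * (u - 128)) >> 16);                     /* 1.772 */
    uint16_t pix;

    if (r < 0) r = 0; else if (r > 255) r = 255;
    if (g < 0) g = 0; else if (g > 255) g = 255;
    if (b < 0) b = 0; else if (b > 255) b = 255;

    pix = ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3);          /* RGB565 */
    dst[0] = pix & 0xff;                                          /* little-endian store */
    dst[1] = pix >> 8;
}

/* Scalar paraphrase of the corrected loop structure.  The chunk loop runs
   exactly h_size >> 3 times, so the last h_size % 8 pixels of each row are
   left untouched -- the black right-hand border mentioned in the log. */
static void yuv420_rgb16_sketch (uint8_t *image, uint8_t *py,
                                 uint8_t *pu, uint8_t *pv,
                                 int h_size, int v_size,
                                 int rgb_stride, int y_stride, int uv_stride)
{
    int even = 1;
    int x, y;

    for (y = v_size; --y >= 0; ) {
        uint8_t *_image = image;      /* per-row working copies, */
        uint8_t *_py = py;            /* as in the new code      */
        uint8_t *_pu = pu;
        uint8_t *_pv = pv;

        for (x = h_size >> 3; --x >= 0; ) {       /* bounded: no overrun */
            int i;
            for (i = 0; i < 8; i++)   /* one MMX iteration = these 8 pixels */
                yuv2rgb_pixel (_image + 2*i, _py[i], _pu[i/2], _pv[i/2]);
            _py += 8;
            _pu += 4;
            _pv += 4;
            _image += 16;             /* 8 pixels * 2 bytes */
        }

        if (!even) {                  /* 4:2:0: one chroma row      */
            pu += uv_stride;          /* serves two luma rows       */
            pv += uv_stride;
        }
        py += y_stride;
        image += rgb_stride;
        even = !even;
    }
}

The counting-down for loops fix the iteration count up front (h_size >> 3 chunks), which is what rules out the overrun the old "x += 8; } while (x < h_size);" pattern allowed when width%8 != 0.
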
--- a/libvo/yuv2rgb_mmx.c
+++ b/libvo/yuv2rgb_mmx.c
@@ -74,28 +74,33 @@
                        uint8_t * pu, uint8_t * pv,
                        int h_size, int v_size,
                        int rgb_stride, int y_stride, int uv_stride)
 {
     int even = 1;
-    int x = 0, y = 0;
+    int x, y;
 
-    /* load data for first scan line */
-    __asm__ __volatile__ (
-           "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-           "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-
-           "pxor %%mm4, %%mm4;" /* zero mm4 */
-           "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-
-           //"movl $0, (%3);" /* cache preload for image */
-           : : "r" (py), "r" (pu), "r" (pv), "r" (image));
+    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
 
-    do {
-       do {
+    for (y = v_size; --y >= 0; ) {
+       uint8_t *_image = image;
+       uint8_t *_py = py;
+       uint8_t *_pu = pu;
+       uint8_t *_pv = pv;
+
+       /* load data for start of next scan line */
+       __asm__ __volatile__ (
+           "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+           "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+           "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+
+           : : "r" (_py), "r" (_pu), "r" (_pv));
+
+       for (x = h_size >> 3; --x >= 0; ) {
 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
    pixels in each iteration */
-           __asm__ __volatile__ (".align 8;"
+
+           __asm__ __volatile__ (
 /* Do the multiply part of the conversion for even and odd pixels,
    register usage:
    mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
    mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
    mm6 -> Y even, mm7 -> Y odd */
@@ -197,73 +202,61 @@
 
            "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
            "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 
            MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */
-           : : "r" (py), "r" (pu), "r" (pv), "r" (image));
+           : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
 
-           py += 8;
-           pu += 4;
-           pv += 4;
-           image += 16;
-           x += 8;
-       } while (x < h_size);
+           _py += 8;
+           _pu += 4;
+           _pv += 4;
+           _image += 16;
+       }
 
-       if (even) {
-           pu -= h_size/2;
-           pv -= h_size/2;
-       } else {
-           pu += (uv_stride - h_size/2);
-           pv += (uv_stride - h_size/2);
+       if (!even) {
+           pu += uv_stride;
+           pv += uv_stride;
        }
 
-       py += (y_stride - h_size);
-       image += (rgb_stride - 2*h_size);
-
-       /* load data for start of next scan line */
-       __asm__ __volatile__ (
-           "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 00 u3 u2 u1 u0 */
-           "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 00 v2 v1 v0 */
-
-           //"movl $0, (%3);" /* cache preload for image */
-           "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-
-           : : "r" (py), "r" (pu), "r" (pv), "r" (image));
-
-       x = 0;
-       y += 1;
+       py += y_stride;
+       image += rgb_stride;
+
        even = (!even);
-   } while (y < v_size) ;
+   }
 
    __asm__ __volatile__ (EMMS);
 }
 
 static void yuv420_argb32_mmx (uint8_t * image, uint8_t * py,
                        uint8_t * pu, uint8_t * pv,
                        int h_size, int v_size,
                        int rgb_stride, int y_stride, int uv_stride)
 {
    int even = 1;
-   int x = 0, y = 0;
+   int x, y;
 
-   __asm__ __volatile__ (
-       ".align 8;"
-       "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-       //"movl $0, (%3);" /* cache preload for image */
-
-       "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-       "pxor %%mm4, %%mm4;" /* zero mm4 */
-
-       "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-       : : "r" (py), "r" (pu), "r" (pv), "r" (image));
-
-   do {
-       do {
+   __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
+
+   for (y = v_size; --y >= 0; ) {
+       uint8_t *_image = image;
+       uint8_t *_py = py;
+       uint8_t *_pu = pu;
+       uint8_t *_pv = pv;
+
+       /* load data for start of next scan line */
+       __asm__ __volatile__
+       (
+           "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+           "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+           "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+           : : "r" (_py), "r" (_pu), "r" (_pv)
+       );
+
+       for (x = h_size >> 3; --x >= 0; ) {
 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
    pixels in each iteration */
            __asm__ __volatile__ (
-               ".align 8;"
 /* Do the multiply part of the conversion for even and odd pixels,
    register usage:
    mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
    mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
    mm6 -> Y even, mm7 -> Y odd */
@@ -377,47 +370,28 @@
            "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 
            "pxor %%mm4, %%mm4;" /* zero mm4 */
            "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 
-           : : "r" (py), "r" (pu), "r" (pv), "r" (image));
+           : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
 
-           py += 8;
-           pu += 4;
-           pv += 4;
-           image += 32;
-           x += 8;
-       } while (x < h_size);
+           _py += 8;
+           _pu += 4;
+           _pv += 4;
+           _image += 32;
+       }
 
-       if (even) {
-           pu -= h_size/2;
-           pv -= h_size/2;
-       } else {
-           pu += (uv_stride - h_size/2);
-           pv += (uv_stride - h_size/2);
+       if (!even) {
+           pu += uv_stride;
+           pv += uv_stride;
        }
 
-       py += (y_stride - h_size);
-       image += (rgb_stride - 4*h_size);
-
-       /* load data for start of next scan line */
-       __asm__ __volatile__
-       (
-           ".align 8;"
-           "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-           "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-
-           //"movl $0, (%3);" /* cache preload for image */
-           "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-           : : "r" (py), "r" (pu), "r" (pv), "r" (image)
-       );
-
-
-       x = 0;
-       y += 1;
+       py += y_stride;
+       image += rgb_stride;
+
        even = (!even);
-   } while ( y < v_size) ;
+   }
 
    __asm__ __volatile__ (EMMS);
 }
 
 yuv2rgb_fun yuv2rgb_init_mmx (int bpp, int mode)
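
The future enhancement suggested in the log message could look roughly like the following hypothetical sketch. It reuses the yuv2rgb_pixel() stand-in from the sketch above, assumes the 16 bpp path, and takes row-start pointers: after the 8-wide loop has converted the first h_size & ~7 pixels of a row, a plain scalar byte loop finishes the remaining h_size % 8 pixels instead of leaving them black.

/* Hypothetical tail loop for the last h_size % 8 pixels of one row;
   image/py/pu/pv point at the start of the row here. */
static void convert_row_tail (uint8_t *image, uint8_t *py,
                              uint8_t *pu, uint8_t *pv, int h_size)
{
    int x;

    for (x = h_size & ~7; x < h_size; x++)   /* pixels the MMX loop skipped */
        yuv2rgb_pixel (image + 2*x,          /* 2 bytes/pixel on 16 bpp path */
                       py[x], pu[x/2], pv[x/2]);
}
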