annotate postproc/yuv2rgb_template.c @ 8763:19e96e60a3d0

Speed optimizations (runs twice as fast) and bugfix (wrong cutoff frequency buffer overrun noise and garbled output when wrong input format)
author anders
date Sat, 04 Jan 2003 06:19:25 +0000
parents e7635c03910f
children 7bbe4bce6293
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
1
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
2 /*
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
3 * yuv2rgb_mmx.c, Software YUV to RGB converter with Intel MMX "technology"
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
4 *
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
5 * Copyright (C) 2000, Silicon Integrated System Corp.
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
6 * All Rights Reserved.
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
7 *
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
8 * Author: Olie Lho <ollie@sis.com.tw>
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
9 *
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
10 * This file is part of mpeg2dec, a free MPEG-2 video decoder
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
11 *
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
12 * mpeg2dec is free software; you can redistribute it and/or modify
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
13 * it under the terms of the GNU General Public License as published by
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
14 * the Free Software Foundation; either version 2, or (at your option)
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
15 * any later version.
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
16 *
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
17 * mpeg2dec is distributed in the hope that it will be useful,
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
20 * GNU General Public License for more details.
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
21 *
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
22 * You should have received a copy of the GNU General Public License
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
23 * along with GNU Make; see the file COPYING. If not, write to
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
24 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
25 *
2749
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
26 * 15,24 bpp and dithering from Michael Niedermayer (michaelni@gmx.at)
3143
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
27 * MMX/MMX2 Template stuff from Michael Niedermayer (needed for fast movntq support)
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
28 */
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
29
3143
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
30 #undef MOVNTQ
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
31 #undef EMMS
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
32 #undef SFENCE
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
33
3143
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
34 #ifdef HAVE_3DNOW
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
35 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
36 #define EMMS "femms"
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
37 #else
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
38 #define EMMS "emms"
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
39 #endif
2739
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
40
3143
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
41 #ifdef HAVE_MMX2
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
42 #define MOVNTQ "movntq"
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
43 #define SFENCE "sfence"
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
44 #else
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
45 #define MOVNTQ "movq"
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
46 #define SFENCE "/nop"
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
47 #endif
2739
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
48
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
49 #define YUV2RGB \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
50 /* Do the multiply part of the conversion for even and odd pixels,
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
51 register usage:
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
52 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
53 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
54 mm6 -> Y even, mm7 -> Y odd */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
55 /* convert the chroma part */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
56 "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
57 "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
58 \
4285
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
59 "psubsw "MANGLE(mmx_80w)", %%mm0;" /* Cb -= 128 */ \
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
60 "psubsw "MANGLE(mmx_80w)", %%mm1;" /* Cr -= 128 */ \
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
61 \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
62 "psllw $3, %%mm0;" /* Promote precision */ \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
63 "psllw $3, %%mm1;" /* Promote precision */ \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
64 \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
65 "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
66 "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
67 \
4285
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
68 "pmulhw "MANGLE(mmx_U_green)", %%mm2;" /* Mul Cb with green coeff -> Cb green */ \
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
69 "pmulhw "MANGLE(mmx_V_green)", %%mm3;" /* Mul Cr with green coeff -> Cr green */ \
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
70 \
4285
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
71 "pmulhw "MANGLE(mmx_U_blue)", %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
72 "pmulhw "MANGLE(mmx_V_red)", %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
73 \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
74 "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
75 \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
76 /* convert the luma part */\
4285
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
77 "psubusb "MANGLE(mmx_10w)", %%mm6;" /* Y -= 16 */\
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
78 \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
79 "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
4285
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
80 "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
81 \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
82 "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
83 \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
84 "psllw $3, %%mm6;" /* Promote precision */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
85 "psllw $3, %%mm7;" /* Promote precision */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
86 \
4285
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
87 "pmulhw "MANGLE(mmx_Y_coeff)", %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
88 "pmulhw "MANGLE(mmx_Y_coeff)", %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
89 \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
90 /* Do the addition part of the conversion for even and odd pixels,
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
91 register usage:
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
92 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
93 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
94 mm6 -> Y even, mm7 -> Y odd */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
95 "movq %%mm0, %%mm3;" /* Copy Cblue */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
96 "movq %%mm1, %%mm4;" /* Copy Cred */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
97 "movq %%mm2, %%mm5;" /* Copy Cgreen */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
98 \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
99 "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
100 "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
101 \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
102 "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
103 "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
104 \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
105 "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
106 "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
107 \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
108 /* Limit RGB even to 0..255 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
109 "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
110 "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
111 "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
112 \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
113 /* Limit RGB odd to 0..255 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
114 "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
115 "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
116 "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
117 \
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
118 /* Interleave RGB even and odd */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
119 "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
120 "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
121 "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
122
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
123
3143
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
124 static inline void RENAME(yuv420_rgb16) (uint8_t * image, uint8_t * py,
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
125 uint8_t * pu, uint8_t * pv,
6492
e7635c03910f sync with mplayer xp
arpi
parents: 5533
diff changeset
126 unsigned h_size, unsigned v_size,
e7635c03910f sync with mplayer xp
arpi
parents: 5533
diff changeset
127 unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
128 {
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
129 int even = 1;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
130 int x, y;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
131
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
132 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
133
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
134 for (y = v_size; --y >= 0; ) {
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
135 uint8_t *_image = image;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
136 uint8_t *_py = py;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
137 uint8_t *_pu = pu;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
138 uint8_t *_pv = pv;
5533
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
139 int internal_h_size= h_size;
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
140 int aligned_h_size= (h_size+7)&~7;
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
141
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
142 if(rgb_stride >= aligned_h_size*2) internal_h_size= aligned_h_size;
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
143
2749
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
144 b5Dither= dither8[y&1];
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
145 g6Dither= dither4[y&1];
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
146 g5Dither= dither8[y&1];
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
147 r5Dither= dither8[(y+1)&1];
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
148
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
149 /* load data for start of next scan line */
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
150 __asm__ __volatile__ (
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
151 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
152 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
153 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
154
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
155 : : "r" (_py), "r" (_pu), "r" (_pv));
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
156
5533
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
157 for (x = internal_h_size >> 3; --x >= 0; ) {
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
158 /* this mmx assembly code deals with a SINGLE scan line at a time; it converts 8
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
159 pixels in each iteration */
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
160
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
161 __asm__ __volatile__ (
2736
aeef41a19f4b 16bpp is 10% faster
michael
parents: 2735
diff changeset
162 /* no speed difference on my p3@500 with prefetch,
aeef41a19f4b 16bpp is 10% faster
michael
parents: 2735
diff changeset
163 * if it is faster for anyone with -benchmark then tell me
aeef41a19f4b 16bpp is 10% faster
michael
parents: 2735
diff changeset
164 PREFETCH" 64(%0) \n\t"
aeef41a19f4b 16bpp is 10% faster
michael
parents: 2735
diff changeset
165 PREFETCH" 64(%1) \n\t"
aeef41a19f4b 16bpp is 10% faster
michael
parents: 2735
diff changeset
166 PREFETCH" 64(%2) \n\t"
aeef41a19f4b 16bpp is 10% faster
michael
parents: 2735
diff changeset
167 */
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
168 YUV2RGB
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
169
2749
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
170 #ifdef DITHER1XBPP
4285
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
171 "paddusb "MANGLE(b5Dither)", %%mm0;"
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
172 "paddusb "MANGLE(g6Dither)", %%mm2;"
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
173 "paddusb "MANGLE(r5Dither)", %%mm1;"
2749
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
174 #endif
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
175 /* mask unneeded bits off */
4285
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
176 "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
177 "pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
178 "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
179
2736
aeef41a19f4b 16bpp is 10% faster
michael
parents: 2735
diff changeset
180 "psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
181 "pxor %%mm4, %%mm4;" /* zero mm4 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
182
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
183 "movq %%mm0, %%mm5;" /* Copy B7-B0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
184 "movq %%mm2, %%mm7;" /* Copy G7-G0 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
185
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
186 /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
187 "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
188 "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
189
2736
aeef41a19f4b 16bpp is 10% faster
michael
parents: 2735
diff changeset
190 "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
191 "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
192
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
193 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
194 MOVNTQ " %%mm0, (%3);" /* store pixel 0-3 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
195
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
196 /* convert rgb24 plane to rgb16 pack for pixel 4-7 */
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
197 "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
198 "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
199
2736
aeef41a19f4b 16bpp is 10% faster
michael
parents: 2735
diff changeset
200 "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
201 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
202
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
203 "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
204 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
205
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
206 MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
207 : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
208
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
209 _py += 8;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
210 _pu += 4;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
211 _pv += 4;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
212 _image += 16;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
213 }
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
214
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
215 if (!even) {
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
216 pu += uv_stride;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
217 pv += uv_stride;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
218 }
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
219
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
220 py += y_stride;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
221 image += rgb_stride;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
222
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
223 even = (!even);
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
224 }
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
225
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
226 __asm__ __volatile__ (EMMS);
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
227 }
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
228
3143
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
229 static inline void RENAME(yuv420_rgb15) (uint8_t * image, uint8_t * py,
2735
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
230 uint8_t * pu, uint8_t * pv,
6492
e7635c03910f sync with mplayer xp
arpi
parents: 5533
diff changeset
231 unsigned h_size, unsigned v_size,
e7635c03910f sync with mplayer xp
arpi
parents: 5533
diff changeset
232 unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
2735
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
233 {
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
234 int even = 1;
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
235 int x, y;
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
236
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
237 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
238
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
239 for (y = v_size; --y >= 0; ) {
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
240 uint8_t *_image = image;
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
241 uint8_t *_py = py;
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
242 uint8_t *_pu = pu;
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
243 uint8_t *_pv = pv;
5533
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
244 int internal_h_size= h_size;
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
245 int aligned_h_size= (h_size+7)&~7;
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
246
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
247 if(rgb_stride >= aligned_h_size*2) internal_h_size= aligned_h_size;
2735
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
248
2749
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
249 b5Dither= dither8[y&1];
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
250 g6Dither= dither4[y&1];
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
251 g5Dither= dither8[y&1];
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
252 r5Dither= dither8[(y+1)&1];
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
253
2735
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
254 /* load data for start of next scan line */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
255 __asm__ __volatile__ (
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
256 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
257 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
258 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
259
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
260 : : "r" (_py), "r" (_pu), "r" (_pv));
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
261
5533
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
262 for (x = internal_h_size >> 3; --x >= 0; ) {
2735
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
263 /* this mmx assembly code deals with a SINGLE scan line at a time; it converts 8
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
264 pixels in each iteration */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
265
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
266 __asm__ __volatile__ (
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
267 YUV2RGB
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
268
2749
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
269 #ifdef DITHER1XBPP
4285
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
270 "paddusb "MANGLE(b5Dither)", %%mm0 \n\t"
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
271 "paddusb "MANGLE(g5Dither)", %%mm2 \n\t"
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
272 "paddusb "MANGLE(r5Dither)", %%mm1 \n\t"
2749
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
273 #endif
2cbecedb2616 15/16 bpp dithering
michael
parents: 2739
diff changeset
274
2735
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
275 /* mask unneeded bits off */
4285
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
276 "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
277 "pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
278 "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
2735
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
279
2736
aeef41a19f4b 16bpp is 10% faster
michael
parents: 2735
diff changeset
280 "psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
2735
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
281 "psrlw $1,%%mm1;" /* 0_r7r6r5 r4r3_0_0 0_r7r6r5 r4r3_0_0 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
282 "pxor %%mm4, %%mm4;" /* zero mm4 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
283
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
284 "movq %%mm0, %%mm5;" /* Copy B7-B0 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
285 "movq %%mm2, %%mm7;" /* Copy G7-G0 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
286
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
287 /* convert rgb24 plane to rgb15 pack for pixel 0-3 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
288 "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
289 "punpcklbw %%mm1, %%mm0;" /* 0_r7r6r5 r4r3_0_0 0_0_0_b7 b6b5b4b3 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
290
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
291 "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
292 "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
293
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
294 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
295 MOVNTQ " %%mm0, (%3);" /* store pixel 0-3 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
296
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
297 /* convert rgb24 plane to rgb15 pack for pixel 4-7 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
298 "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
299 "punpckhbw %%mm1, %%mm5;" /* 0_r7r6r5 r4r3_0_0 0_0_0_b7 b6b5b4b3 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
300
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
301 "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
302 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
303
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
304 "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
305 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
306
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
307 MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
308 : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
309
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
310 _py += 8;
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
311 _pu += 4;
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
312 _pv += 4;
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
313 _image += 16;
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
314 }
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
315
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
316 if (!even) {
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
317 pu += uv_stride;
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
318 pv += uv_stride;
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
319 }
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
320
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
321 py += y_stride;
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
322 image += rgb_stride;
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
323
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
324 even = (!even);
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
325 }
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
326
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
327 __asm__ __volatile__ (EMMS);
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
328 }
0f4c3ae92857 15 bpp support
michael
parents: 2734
diff changeset
329
3143
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
330 static inline void RENAME(yuv420_rgb24) (uint8_t * image, uint8_t * py,
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
331 uint8_t * pu, uint8_t * pv,
6492
e7635c03910f sync with mplayer xp
arpi
parents: 5533
diff changeset
332 unsigned h_size, unsigned v_size,
e7635c03910f sync with mplayer xp
arpi
parents: 5533
diff changeset
333 unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
334 {
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
335 int even = 1;
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
336 int x, y;
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
337
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
338 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
339
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
340 for (y = v_size; --y >= 0; ) {
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
341 uint8_t *_image = image;
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
342 uint8_t *_py = py;
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
343 uint8_t *_pu = pu;
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
344 uint8_t *_pv = pv;
5533
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
345 int internal_h_size= h_size;
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
346 int aligned_h_size= (h_size+7)&~7;
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
347
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
348 if(rgb_stride >= aligned_h_size*3) internal_h_size= aligned_h_size;
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
349
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
350 /* load data for start of next scan line */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
351 __asm__ __volatile__ (
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
352 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
353 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
354 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
355
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
356 : : "r" (_py), "r" (_pu), "r" (_pv));
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
357
5533
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
358 for (x = internal_h_size >> 3; --x >= 0; ) {
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
359 /* this mmx assembly code deals with a SINGLE scan line at a time; it converts 8
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
360 pixels in each iteration */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
361
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
362 __asm__ __volatile__ (
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
363 YUV2RGB
2739
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
364 /* mm0=B, mm2=G, mm1=R */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
365 #ifdef HAVE_MMX2
4285
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
366 "movq "MANGLE(M24A)", %%mm4 \n\t"
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
367 "movq "MANGLE(M24C)", %%mm7 \n\t"
2739
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
368 "pshufw $0x50, %%mm0, %%mm5 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
369 "pshufw $0x50, %%mm2, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
370 "pshufw $0x00, %%mm1, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
371
2739
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
372 "pand %%mm4, %%mm5 \n\t" /* B2 B1 B0 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
373 "pand %%mm4, %%mm3 \n\t" /* G2 G1 G0 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
374 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
375
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
376 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
377 "por %%mm5, %%mm6 \n\t"
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
378 "por %%mm3, %%mm6 \n\t"
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
379 MOVNTQ" %%mm6, (%3) \n\t"
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
380
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
381 "psrlq $8, %%mm2 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
382 "pshufw $0xA5, %%mm0, %%mm5 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
383 "pshufw $0x55, %%mm2, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
384 "pshufw $0xA5, %%mm1, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
385
4285
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
386 "pand "MANGLE(M24B)", %%mm5 \n\t" /* B5 B4 B3 */
2739
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
387 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
388 "pand %%mm4, %%mm6 \n\t" /* R4 R3 R2 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
389
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
390 "por %%mm5, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
391 "por %%mm3, %%mm6 \n\t"
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
392 MOVNTQ" %%mm6, 8(%3) \n\t"
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
393
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
394 "pshufw $0xFF, %%mm0, %%mm5 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
395 "pshufw $0xFA, %%mm2, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
396 "pshufw $0xFA, %%mm1, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
397 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
398
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
399 "pand %%mm7, %%mm5 \n\t" /* B7 B6 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
400 "pand %%mm4, %%mm3 \n\t" /* G7 G6 G5 */
4285
20806e535b96 add mangling
atmos4
parents: 3143
diff changeset
401 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */
2739
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
402 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
403 \
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
404 "por %%mm5, %%mm3 \n\t"
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
405 "por %%mm3, %%mm6 \n\t"
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
406 MOVNTQ" %%mm6, 16(%3) \n\t"
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
407 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
408 "pxor %%mm4, %%mm4 \n\t"
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
409
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
410 #else
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
411
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
412 "pxor %%mm4, %%mm4 \n\t"
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
413 "movq %%mm0, %%mm5 \n\t" /* B */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
414 "movq %%mm1, %%mm6 \n\t" /* R */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
415 "punpcklbw %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
416 "punpcklbw %%mm4, %%mm1 \n\t" /* 0R0R0R0R 0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
417 "punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
418 "punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
419 "movq %%mm0, %%mm7 \n\t" /* GBGBGBGB 0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
420 "movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
421 "punpcklwd %%mm1, %%mm7 \n\t" /* 0RGB0RGB 0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
422 "punpckhwd %%mm1, %%mm0 \n\t" /* 0RGB0RGB 1 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
423 "punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
424 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
425
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
426 "movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
427 "movq %%mm0, %%mm6 \n\t" /* 0RGB0RGB 1 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
428 "movq %%mm5, %%mm1 \n\t" /* 0RGB0RGB 2 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
429 "movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
430
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
431 "psllq $40, %%mm7 \n\t" /* RGB00000 0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
432 "psllq $40, %%mm0 \n\t" /* RGB00000 1 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
433 "psllq $40, %%mm5 \n\t" /* RGB00000 2 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
434 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
435
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
436 "punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
437 "punpckhdq %%mm6, %%mm0 \n\t" /* 0RGBRGB0 1 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
438 "punpckhdq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
439 "punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
440
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
441 "psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
442 "movq %%mm0, %%mm6 \n\t" /* 0RGBRGB0 1 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
443 "psllq $40, %%mm0 \n\t" /* GB000000 1 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
444 "por %%mm0, %%mm7 \n\t" /* GBRGBRGB 0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
445 MOVNTQ" %%mm7, (%3) \n\t"
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
446
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
447 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
448
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
449 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
450 "movq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
451 "psllq $24, %%mm5 \n\t" /* BRGB0000 2 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
452 "por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
453 MOVNTQ" %%mm6, 8(%3) \n\t"
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
454
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
455 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
456
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
457 "psrlq $40, %%mm1 \n\t" /* 000000RG 2 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
458 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
459 "por %%mm3, %%mm1 \n\t" /* RGBRGBRG 2 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
460 MOVNTQ" %%mm1, 16(%3) \n\t"
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
461
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
462 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
463 "pxor %%mm4, %%mm4 \n\t"
2739
f365024806c0 mmx2 bgr24 stuff from swscale (slightly faster)
michael
parents: 2737
diff changeset
464 #endif
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
465
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
466 : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
467
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
468 _py += 8;
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
469 _pu += 4;
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
470 _pv += 4;
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
471 _image += 24;
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
472 }
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
473
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
474 if (!even) {
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
475 pu += uv_stride;
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
476 pv += uv_stride;
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
477 }
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
478
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
479 py += y_stride;
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
480 image += rgb_stride;
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
481
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
482 even = (!even);
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
483 }
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
484
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
485 __asm__ __volatile__ (EMMS);
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
486 }
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
487
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
488
3143
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
489 static inline void RENAME(yuv420_argb32) (uint8_t * image, uint8_t * py,
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
490 uint8_t * pu, uint8_t * pv,
6492
e7635c03910f sync with mplayer xp
arpi
parents: 5533
diff changeset
491 unsigned h_size, unsigned v_size,
e7635c03910f sync with mplayer xp
arpi
parents: 5533
diff changeset
492 unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
493 {
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
494 int even = 1;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
495 int x, y;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
496
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
497 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
498
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
499 for (y = v_size; --y >= 0; ) {
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
500 uint8_t *_image = image;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
501 uint8_t *_py = py;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
502 uint8_t *_pu = pu;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
503 uint8_t *_pv = pv;
5533
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
504 int internal_h_size= h_size;
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
505 int aligned_h_size= (h_size+7)&~7;
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
506
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
507 if(rgb_stride >= aligned_h_size*4) internal_h_size= aligned_h_size;
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
508
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
509 /* load data for start of next scan line */
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
510 __asm__ __volatile__
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
511 (
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
512 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
513 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
514 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
515 : : "r" (_py), "r" (_pu), "r" (_pv)
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
516 );
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
517
5533
dd78203ea703 fixing black stripe at right if width%8!=0 && stride%8==0 reported by kabi
michael
parents: 4285
diff changeset
518 for (x = internal_h_size >> 3; --x >= 0; ) {
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
519 /* this mmx assembly code deals with a SINGLE scan line at a time; it converts 8
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
520 pixels in each iteration */
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
521 __asm__ __volatile__ (
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
522 YUV2RGB
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
523 /* convert RGB plane to RGB packed format,
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
524 mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
525 mm4 -> GB, mm5 -> AR pixel 4-7,
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
526 mm6 -> GB, mm7 -> AR pixel 0-3 */
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
527 "pxor %%mm3, %%mm3;" /* zero mm3 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
528
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
529 "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
530 "movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
531
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
532 "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
533 "movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
534
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
535 "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
536 "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
537
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
538 "punpcklwd %%mm7, %%mm6;" /* 00 R1 G1 B1 00 R0 G0 B0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
539 MOVNTQ " %%mm6, (%3);" /* Store ARGB1 ARGB0 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
540
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
541 "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
542 "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
543
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
544 "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 G2 B2 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
545 MOVNTQ " %%mm6, 8 (%3);" /* Store ARGB3 ARGB2 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
546
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
547 "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
548 "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
549
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
550 "punpcklwd %%mm5, %%mm4;" /* 00 R5 G5 B5 00 R4 G4 B4 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
551 MOVNTQ " %%mm4, 16 (%3);" /* Store ARGB5 ARGB4 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
552
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
553 "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
554 "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
555
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
556 "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 G6 B6 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
557 MOVNTQ " %%mm4, 24 (%3);" /* Store ARGB7 ARGB6 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
558
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
559 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
560 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
561
2734
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
562 "pxor %%mm4, %%mm4;" /* zero mm4 */
4ce165aa0135 common code moved to a #define
michael
parents: 2732
diff changeset
563 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
564
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
565 : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
566
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
567 _py += 8;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
568 _pu += 4;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
569 _pv += 4;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
570 _image += 32;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
571 }
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
572
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
573 if (!even) {
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
574 pu += uv_stride;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
575 pv += uv_stride;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
576 }
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
577
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
578 py += y_stride;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
579 image += rgb_stride;
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
580
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
581 even = (!even);
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
582 }
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
583
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
584 __asm__ __volatile__ (EMMS);
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
585 }
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
586
6492
e7635c03910f sync with mplayer xp
arpi
parents: 5533
diff changeset
587 yuv2rgb_fun RENAME(yuv2rgb_init) (unsigned bpp, int mode)
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
588 {
3143
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
589 if (bpp == 15 && mode == MODE_RGB) return RENAME(yuv420_rgb15);
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
590 if (bpp == 16 && mode == MODE_RGB) return RENAME(yuv420_rgb16);
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
591 if (bpp == 24 && mode == MODE_RGB) return RENAME(yuv420_rgb24);
86910f54c391 runtime cpu detection
michael
parents: 2749
diff changeset
592 if (bpp == 32 && mode == MODE_RGB) return RENAME(yuv420_argb32);
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
593 return NULL; // Fallback to C.
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
594 }
ae79207a3055 Move yuv2rgb to postprocess
nick
parents:
diff changeset
595