comparison libvo/yuv2rgb_mmx.c @ 1101:961f53221ffc

Code cleanup; fix missing config.h include; use femms on K6 2/2+/3.
author atmosfear
date Mon, 11 Jun 2001 17:43:15 +0000
parents ed6ac3915d59
children 7ce37211e454
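
The gist of the change below: the inline-assembly comments move from GAS `#` syntax into C `/* */` comments outside the strings, the asm statements gain `__volatile__`, the lowercase `movntq` macro becomes `MOVNTQ`, and MMX state is now cleared through an `EMMS` macro that expands to `femms` on 3DNow! CPUs (AMD K6 2/2+/3), where it is the faster way to leave MMX state. A minimal sketch of how the new macro is consumed, assuming HAVE_MMX2/HAVE_3DNOW come from config.h as in the diff:

    /* Sketch only -- mirrors the macro added in this changeset. */
    #if !defined(HAVE_MMX2) && defined(HAVE_3DNOW)
    #define EMMS "femms;"   /* faster MMX/FPU state clear on K6 2/2+/3 */
    #else
    #define EMMS "emms;"
    #endif

    /* Leave MMX state so that later x87 floating-point code works. */
    static void mmx_end(void)
    {
        __asm__ __volatile__ (EMMS);
    }
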
--- a/libvo/yuv2rgb_mmx.c	1100:7829676dc4ef
+++ b/libvo/yuv2rgb_mmx.c	1101:961f53221ffc
@@ -25,10 +25,12 @@
  *
  */
 
 #include <stdio.h>
 #include <stdlib.h>
+
+#include "../config.h"
 
 #include "mmx.h"
 //#include "libmpeg2/mpeg2.h"
 //#include "libmpeg2/mpeg2_internal.h"
 #include <inttypes.h>
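
The newly added ../config.h include is what defines the HAVE_MMX2 / HAVE_3DNOW feature macros tested further down; without it the #if guards silently fall back to the plain-MMX path. A hypothetical fragment of the configure-generated config.h (names match the guards used here, values are illustrative):

    /* Hypothetical config.h fragment -- the real file is generated by
       MPlayer's ./configure for the build host. */
    #define HAVE_MMX 1
    #define HAVE_MMX2 1     /* K7 / PIII: movntq available */
    #undef HAVE_3DNOW       /* if defined (and no MMX2): EMMS -> "femms;" */
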
@@ -52,144 +54,153 @@
 uint64_t mmx_grnmask = 0xfcfcfcfcfcfcfcfc;
 uint64_t mmx_grnshift = 0x03;
 uint64_t mmx_blueshift = 0x03;
 
 #ifdef HAVE_MMX2
-#define movntq "movntq" // use this for K7 and p3 only
+/* use this for K7 and p3 only */
+#define MOVNTQ "movntq"
 #else
-#define movntq "movq" // for MMX-only processors
+/* for MMX-only processors */
+#define MOVNTQ "movq"
+#endif
+
+#if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
+/* for K6 2/2+/3 */
+#define EMMS "femms;"
+#else
+#define EMMS "emms;"
 #endif
 
 static void yuv420_rgb16_mmx (uint8_t * image, uint8_t * py,
                               uint8_t * pu, uint8_t * pv,
                               int h_size, int v_size,
                               int rgb_stride, int y_stride, int uv_stride)
 {
     int even = 1;
     int x = 0, y = 0;
 
     /* load data for first scan line */
-    __asm__ (
-        "movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0\n\t"
-        "movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0\n\t"
+    __asm__ __volatile__ (
+        "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+        "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 
-        "pxor %%mm4, %%mm4 # zero mm4\n\t"
-        "movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t"
+        "pxor %%mm4, %%mm4;" /* zero mm4 */
+        "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 
-        //"movl $0, (%3) # cache preload for image\n\t"
+        //"movl $0, (%3);" /* cache preload for image */
        : : "r" (py), "r" (pu), "r" (pv), "r" (image));
 
     do {
         do {
            /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
               pixels in each iteration */
-            __asm__ (".align 8 \n\t"
+            __asm__ __volatile__ (".align 8;"
                 /* Do the multiply part of the conversion for even and odd pixels,
                    register usage:
                    mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
                    mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
                    mm6 -> Y even, mm7 -> Y odd */
                 /* convert the chroma part */
-                "punpcklbw %%mm4, %%mm0 # scatter 4 Cb 00 u3 00 u2 00 u1 00 u0\n\t"
-                "punpcklbw %%mm4, %%mm1 # scatter 4 Cr 00 v3 00 v2 00 v1 00 v0\n\t"
+                "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */
+                "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */
 
-                "psubsw mmx_80w, %%mm0 # Cb -= 128\n\t"
-                "psubsw mmx_80w, %%mm1 # Cr -= 128\n\t"
+                "psubsw mmx_80w, %%mm0;" /* Cb -= 128 */
+                "psubsw mmx_80w, %%mm1;" /* Cr -= 128 */
 
-                "psllw $3, %%mm0 # Promote precision\n\t"
-                "psllw $3, %%mm1 # Promote precision\n\t"
+                "psllw $3, %%mm0;" /* Promote precision */
+                "psllw $3, %%mm1;" /* Promote precision */
 
-                "movq %%mm0, %%mm2 # Copy 4 Cb 00 u3 00 u2 00 u1 00 u0\n\t"
-                "movq %%mm1, %%mm3 # Copy 4 Cr 00 v3 00 v2 00 v1 00 v0\n\t"
+                "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */
+                "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */
 
-                "pmulhw mmx_U_green, %%mm2# Mul Cb with green coeff -> Cb green\n\t"
-                "pmulhw mmx_V_green, %%mm3# Mul Cr with green coeff -> Cr green\n\t"
+                "pmulhw mmx_U_green, %%mm2;" /* Mul Cb with green coeff -> Cb green */
+                "pmulhw mmx_V_green, %%mm3;" /* Mul Cr with green coeff -> Cr green */
 
-                "pmulhw mmx_U_blue, %%mm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0\n\t"
-                "pmulhw mmx_V_red, %%mm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0\n\t"
+                "pmulhw mmx_U_blue, %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */
+                "pmulhw mmx_V_red, %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */
 
-                "paddsw %%mm3, %%mm2 # Cb green + Cr green -> Cgreen\n\t"
+                "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */
 
                 /* convert the luma part */
-                "psubusb mmx_10w, %%mm6 # Y -= 16\n\t"
+                "psubusb mmx_10w, %%mm6;" /* Y -= 16 */
 
-                "movq %%mm6, %%mm7 # Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t"
-                "pand mmx_00ffw, %%mm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0\n\t"
+                "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+                "pand mmx_00ffw, %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */
 
-                "psrlw $8, %%mm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1\n\t"
+                "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */
 
-                "psllw $3, %%mm6 # Promote precision\n\t"
-                "psllw $3, %%mm7 # Promote precision\n\t"
+                "psllw $3, %%mm6;" /* Promote precision */
+                "psllw $3, %%mm7;" /* Promote precision */
 
-                "pmulhw mmx_Y_coeff, %%mm6# Mul 4 Y even 00 y6 00 y4 00 y2 00 y0\n\t"
-                "pmulhw mmx_Y_coeff, %%mm7# Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1\n\t"
+                "pmulhw mmx_Y_coeff, %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */
+                "pmulhw mmx_Y_coeff, %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */
 
                 /* Do the addition part of the conversion for even and odd pixels,
                    register usage:
                    mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
                    mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
                    mm6 -> Y even, mm7 -> Y odd */
-                "movq %%mm0, %%mm3 # Copy Cblue\n\t"
-                "movq %%mm1, %%mm4 # Copy Cred\n\t"
-                "movq %%mm2, %%mm5 # Copy Cgreen\n\t"
+                "movq %%mm0, %%mm3;" /* Copy Cblue */
+                "movq %%mm1, %%mm4;" /* Copy Cred */
+                "movq %%mm2, %%mm5;" /* Copy Cgreen */
 
-                "paddsw %%mm6, %%mm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0\n\t"
-                "paddsw %%mm7, %%mm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1\n\t"
+                "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */
+                "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */
 
-                "paddsw %%mm6, %%mm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0\n\t"
-                "paddsw %%mm7, %%mm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1\n\t"
+                "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */
+                "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */
 
-                "paddsw %%mm6, %%mm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0\n\t"
-                "paddsw %%mm7, %%mm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1\n\t"
+                "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */
+                "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */
 
                 /* Limit RGB even to 0..255 */
-                "packuswb %%mm0, %%mm0 # B6 B4 B2 B0 B6 B4 B2 B0\n\t"
-                "packuswb %%mm1, %%mm1 # R6 R4 R2 R0 R6 R4 R2 R0\n\t"
-                "packuswb %%mm2, %%mm2 # G6 G4 G2 G0 G6 G4 G2 G0\n\t"
+                "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */
+                "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */
+                "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */
 
                 /* Limit RGB odd to 0..255 */
-                "packuswb %%mm3, %%mm3 # B7 B5 B3 B1 B7 B5 B3 B1\n\t"
-                "packuswb %%mm4, %%mm4 # R7 R5 R3 R1 R7 R5 R3 R1\n\t"
-                "packuswb %%mm5, %%mm5 # G7 G5 G3 G1 G7 G5 G3 G1\n\t"
+                "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */
+                "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */
+                "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */
 
                 /* Interleave RGB even and odd */
-                "punpcklbw %%mm3, %%mm0 # B7 B6 B5 B4 B3 B2 B1 B0\n\t"
-                "punpcklbw %%mm4, %%mm1 # R7 R6 R5 R4 R3 R2 R1 R0\n\t"
-                "punpcklbw %%mm5, %%mm2 # G7 G6 G5 G4 G3 G2 G1 G0\n\t"
+                "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
+                "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
+                "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */
 
                 /* mask unneeded bits off */
-                "pand mmx_redmask, %%mm0# b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0\n\t"
-                "pand mmx_grnmask, %%mm2# g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0\n\t"
-                "pand mmx_redmask, %%mm1# r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0\n\t"
+                "pand mmx_redmask, %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
+                "pand mmx_grnmask, %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */
+                "pand mmx_redmask, %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
 
-                "psrlw mmx_blueshift,%%mm0#0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3\n\t"
-                "pxor %%mm4, %%mm4 # zero mm4\n\t"
+                "psrlw mmx_blueshift,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
+                "pxor %%mm4, %%mm4;" /* zero mm4 */
 
-                "movq %%mm0, %%mm5 # Copy B7-B0\n\t"
-                "movq %%mm2, %%mm7 # Copy G7-G0\n\t"
+                "movq %%mm0, %%mm5;" /* Copy B7-B0 */
+                "movq %%mm2, %%mm7;" /* Copy G7-G0 */
 
                 /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
-                "punpcklbw %%mm4, %%mm2 # 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0\n\t"
-                "punpcklbw %%mm1, %%mm0 # r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3\n\t"
+                "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
+                "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
 
-                "psllw mmx_blueshift,%%mm2# 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0\n\t"
-                "por %%mm2, %%mm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3\n\t"
+                "psllw mmx_blueshift,%%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
+                "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
 
-                "movq 8 (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t"
-                movntq " %%mm0, (%3) # store pixel 0-3\n\t"
+                "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+                MOVNTQ " %%mm0, (%3);" /* store pixel 0-3 */
 
                 /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
-                "punpckhbw %%mm4, %%mm7 # 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0\n\t"
-                "punpckhbw %%mm1, %%mm5 # r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3\n\t"
+                "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
+                "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
 
-                "psllw mmx_blueshift,%%mm7# 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0\n\t"
-                "movd 4 (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0\n\t"
+                "psllw mmx_blueshift,%%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
+                "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 
-                "por %%mm7, %%mm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3\n\t"
-                "movd 4 (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0\n\t"
+                "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
+                "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 
-                movntq " %%mm5, 8 (%3) # store pixel 4-7\n\t"
+                MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */
                 : : "r" (py), "r" (pu), "r" (pv), "r" (image));
 
             py += 8;
             pu += 4;
             pv += 4;
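
A detail worth noting in the hunk above: MOVNTQ is spliced into the asm template by C's adjacent-string-literal concatenation, so MOVNTQ " %%mm0, (%3);" compiles to either "movntq %%mm0, (%3);" or "movq %%mm0, (%3);". A self-contained sketch of the mechanism (x86-32 GCC inline asm assumed; illustrative, not code from this changeset):

    #include <stdint.h>

    #ifdef HAVE_MMX2
    #define MOVNTQ "movntq"   /* non-temporal store, bypasses the cache (K7, P3) */
    #else
    #define MOVNTQ "movq"     /* ordinary store on MMX-only CPUs */
    #endif

    /* Adjacent string literals are concatenated at compile time, so the
       template below becomes "movntq %%mm0, (%0);" or "movq %%mm0, (%0);". */
    static inline void store8(uint8_t *dst)
    {
        __asm__ __volatile__ (MOVNTQ " %%mm0, (%0);" : : "r" (dst) : "memory");
    }
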
@@ -207,168 +218,168 @@
 
         py += (y_stride - h_size);
         image += (rgb_stride - 2*h_size);
 
         /* load data for start of next scan line */
-        __asm__ (
-            "movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 00 u3 u2 u1 u0\n\t"
-            "movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 00 v2 v1 v0\n\t"
+        __asm__ __volatile__ (
+            "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 00 u3 u2 u1 u0 */
+            "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 00 v2 v1 v0 */
 
-            //"movl $0, (%3) # cache preload for image\n\t"
-            "movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t"
+            //"movl $0, (%3);" /* cache preload for image */
+            "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 
             : : "r" (py), "r" (pu), "r" (pv), "r" (image));
 
         x = 0;
         y += 1;
         even = (!even);
     } while (y < v_size) ;
 
-    __asm__ ("emms\n\t");
+    __asm__ __volatile__ (EMMS);
 }
 
 static void yuv420_argb32_mmx (uint8_t * image, uint8_t * py,
                                uint8_t * pu, uint8_t * pv,
                                int h_size, int v_size,
                                int rgb_stride, int y_stride, int uv_stride)
 {
     int even = 1;
     int x = 0, y = 0;
 
-    __asm__ (
-        ".align 8 \n\t"
-        "movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0\n\t"
-        //"movl $0, (%3) # cache preload for image\n\t"
+    __asm__ __volatile__ (
+        ".align 8;"
+        "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+        //"movl $0, (%3);" /* cache preload for image */
 
-        "movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0\n\t"
-        "pxor %%mm4, %%mm4 # zero mm4\n\t"
+        "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+        "pxor %%mm4, %%mm4;" /* zero mm4 */
 
-        "movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t"
+        "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
         : : "r" (py), "r" (pu), "r" (pv), "r" (image));
 
     do {
         do {
            /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
               pixels in each iteration */
-            __asm__ (
-                ".align 8 \n\t"
+            __asm__ __volatile__ (
+                ".align 8;"
                 /* Do the multiply part of the conversion for even and odd pixels,
                    register usage:
                    mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
                    mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
                    mm6 -> Y even, mm7 -> Y odd */
 
                 /* convert the chroma part */
-                "punpcklbw %%mm4, %%mm0 # scatter 4 Cb 00 u3 00 u2 00 u1 00 u0\n\t"
-                "punpcklbw %%mm4, %%mm1 # scatter 4 Cr 00 v3 00 v2 00 v1 00 v0\n\t"
+                "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */
+                "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */
 
-                "psubsw mmx_80w, %%mm0 # Cb -= 128\n\t"
-                "psubsw mmx_80w, %%mm1 # Cr -= 128\n\t"
+                "psubsw mmx_80w, %%mm0;" /* Cb -= 128 */
+                "psubsw mmx_80w, %%mm1;" /* Cr -= 128 */
 
-                "psllw $3, %%mm0 # Promote precision\n\t"
-                "psllw $3, %%mm1 # Promote precision\n\t"
+                "psllw $3, %%mm0;" /* Promote precision */
+                "psllw $3, %%mm1;" /* Promote precision */
 
-                "movq %%mm0, %%mm2 # Copy 4 Cb 00 u3 00 u2 00 u1 00 u0\n\t"
-                "movq %%mm1, %%mm3 # Copy 4 Cr 00 v3 00 v2 00 v1 00 v0\n\t"
+                "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */
+                "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */
 
-                "pmulhw mmx_U_green, %%mm2# Mul Cb with green coeff -> Cb green\n\t"
-                "pmulhw mmx_V_green, %%mm3# Mul Cr with green coeff -> Cr green\n\t"
+                "pmulhw mmx_U_green, %%mm2;" /* Mul Cb with green coeff -> Cb green */
+                "pmulhw mmx_V_green, %%mm3;" /* Mul Cr with green coeff -> Cr green */
 
-                "pmulhw mmx_U_blue, %%mm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0\n\t"
-                "pmulhw mmx_V_red, %%mm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0\n\t"
+                "pmulhw mmx_U_blue, %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */
+                "pmulhw mmx_V_red, %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */
 
-                "paddsw %%mm3, %%mm2 # Cb green + Cr green -> Cgreen\n\t"
+                "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */
 
                 /* convert the luma part */
-                "psubusb mmx_10w, %%mm6 # Y -= 16\n\t"
+                "psubusb mmx_10w, %%mm6;" /* Y -= 16 */
 
-                "movq %%mm6, %%mm7 # Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t"
-                "pand mmx_00ffw, %%mm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0\n\t"
+                "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+                "pand mmx_00ffw, %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */
 
-                "psrlw $8, %%mm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1\n\t"
+                "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */
 
-                "psllw $3, %%mm6 # Promote precision\n\t"
-                "psllw $3, %%mm7 # Promote precision\n\t"
+                "psllw $3, %%mm6;" /* Promote precision */
+                "psllw $3, %%mm7;" /* Promote precision */
 
-                "pmulhw mmx_Y_coeff, %%mm6# Mul 4 Y even 00 y6 00 y4 00 y2 00 y0\n\t"
-                "pmulhw mmx_Y_coeff, %%mm7# Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1\n\t"
+                "pmulhw mmx_Y_coeff, %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */
+                "pmulhw mmx_Y_coeff, %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */
 
                 /* Do the addition part of the conversion for even and odd pixels,
                    register usage:
                    mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
                    mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
                    mm6 -> Y even, mm7 -> Y odd */
 
-                "movq %%mm0, %%mm3 # Copy Cblue\n\t"
-                "movq %%mm1, %%mm4 # Copy Cred\n\t"
-                "movq %%mm2, %%mm5 # Copy Cgreen\n\t"
+                "movq %%mm0, %%mm3;" /* Copy Cblue */
+                "movq %%mm1, %%mm4;" /* Copy Cred */
+                "movq %%mm2, %%mm5;" /* Copy Cgreen */
 
-                "paddsw %%mm6, %%mm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0\n\t"
-                "paddsw %%mm7, %%mm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1\n\t"
+                "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */
+                "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */
 
-                "paddsw %%mm6, %%mm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0\n\t"
-                "paddsw %%mm7, %%mm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1\n\t"
+                "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */
+                "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */
 
-                "paddsw %%mm6, %%mm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0\n\t"
-                "paddsw %%mm7, %%mm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1\n\t"
+                "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */
+                "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */
 
                 /* Limit RGB even to 0..255 */
-                "packuswb %%mm0, %%mm0 # B6 B4 B2 B0 B6 B4 B2 B0\n\t"
-                "packuswb %%mm1, %%mm1 # R6 R4 R2 R0 R6 R4 R2 R0\n\t"
-                "packuswb %%mm2, %%mm2 # G6 G4 G2 G0 G6 G4 G2 G0\n\t"
+                "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */
+                "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */
+                "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */
 
                 /* Limit RGB odd to 0..255 */
-                "packuswb %%mm3, %%mm3 # B7 B5 B3 B1 B7 B5 B3 B1\n\t"
-                "packuswb %%mm4, %%mm4 # R7 R5 R3 R1 R7 R5 R3 R1\n\t"
-                "packuswb %%mm5, %%mm5 # G7 G5 G3 G1 G7 G5 G3 G1\n\t"
+                "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */
+                "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */
+                "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */
 
                 /* Interleave RGB even and odd */
-                "punpcklbw %%mm3, %%mm0 # B7 B6 B5 B4 B3 B2 B1 B0\n\t"
-                "punpcklbw %%mm4, %%mm1 # R7 R6 R5 R4 R3 R2 R1 R0\n\t"
-                "punpcklbw %%mm5, %%mm2 # G7 G6 G5 G4 G3 G2 G1 G0\n\t"
+                "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
+                "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
+                "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */
 
                 /* convert RGB plane to RGB packed format,
                    mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
                    mm4 -> GB, mm5 -> AR pixel 4-7,
                    mm6 -> GB, mm7 -> AR pixel 0-3 */
-                "pxor %%mm3, %%mm3 # zero mm3\n\t"
+                "pxor %%mm3, %%mm3;" /* zero mm3 */
 
-                "movq %%mm0, %%mm6 # B7 B6 B5 B4 B3 B2 B1 B0\n\t"
-                "movq %%mm1, %%mm7 # R7 R6 R5 R4 R3 R2 R1 R0\n\t"
+                "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
+                "movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
 
-                "movq %%mm0, %%mm4 # B7 B6 B5 B4 B3 B2 B1 B0\n\t"
-                "movq %%mm1, %%mm5 # R7 R6 R5 R4 R3 R2 R1 R0\n\t"
+                "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
+                "movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
 
-                "punpcklbw %%mm2, %%mm6 # G3 B3 G2 B2 G1 B1 G0 B0\n\t"
-                "punpcklbw %%mm3, %%mm7 # 00 R3 00 R2 00 R1 00 R0\n\t"
+                "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
+                "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */
 
-                "punpcklwd %%mm7, %%mm6 # 00 R1 B1 G1 00 R0 B0 G0\n\t"
-                movntq " %%mm6, (%3) # Store ARGB1 ARGB0\n\t"
+                "punpcklwd %%mm7, %%mm6;" /* 00 R1 B1 G1 00 R0 B0 G0 */
+                MOVNTQ " %%mm6, (%3);" /* Store ARGB1 ARGB0 */
 
-                "movq %%mm0, %%mm6 # B7 B6 B5 B4 B3 B2 B1 B0\n\t"
-                "punpcklbw %%mm2, %%mm6 # G3 B3 G2 B2 G1 B1 G0 B0\n\t"
+                "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
+                "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
 
-                "punpckhwd %%mm7, %%mm6 # 00 R3 G3 B3 00 R2 B3 G2\n\t"
-                movntq " %%mm6, 8 (%3) # Store ARGB3 ARGB2\n\t"
+                "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 B3 G2 */
+                MOVNTQ " %%mm6, 8 (%3);" /* Store ARGB3 ARGB2 */
 
-                "punpckhbw %%mm2, %%mm4 # G7 B7 G6 B6 G5 B5 G4 B4\n\t"
-                "punpckhbw %%mm3, %%mm5 # 00 R7 00 R6 00 R5 00 R4\n\t"
+                "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
+                "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */
 
-                "punpcklwd %%mm5, %%mm4 # 00 R5 B5 G5 00 R4 B4 G4\n\t"
-                movntq " %%mm4, 16 (%3) # Store ARGB5 ARGB4\n\t"
+                "punpcklwd %%mm5, %%mm4;" /* 00 R5 B5 G5 00 R4 B4 G4 */
+                MOVNTQ " %%mm4, 16 (%3);" /* Store ARGB5 ARGB4 */
 
-                "movq %%mm0, %%mm4 # B7 B6 B5 B4 B3 B2 B1 B0\n\t"
-                "punpckhbw %%mm2, %%mm4 # G7 B7 G6 B6 G5 B5 G4 B4\n\t"
+                "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
+                "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
 
-                "punpckhwd %%mm5, %%mm4 # 00 R7 G7 B7 00 R6 B6 G6\n\t"
-                movntq " %%mm4, 24 (%3) # Store ARGB7 ARGB6\n\t"
+                "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 B6 G6 */
+                MOVNTQ " %%mm4, 24 (%3);" /* Store ARGB7 ARGB6 */
 
-                "movd 4 (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0\n\t"
-                "movd 4 (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0\n\t"
+                "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+                "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 
-                "pxor %%mm4, %%mm4 # zero mm4\n\t"
-                "movq 8 (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t"
+                "pxor %%mm4, %%mm4;" /* zero mm4 */
+                "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 
                 : : "r" (py), "r" (pu), "r" (pv), "r" (image));
 
             py += 8;
             pu += 4;
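
Before the final hunk, a note on the pointer bookkeeping it shows: after a row of h_size pixels the inner loop has advanced image by 4*h_size bytes (ARGB32 is 4 bytes per pixel, versus 2*h_size in the rgb16 path) and py by h_size bytes, so adding the leftover stride positions both pointers at the start of the next row. An illustrative sketch, with names taken from the converter's parameters:

    #include <stdint.h>

    /* Illustrative only: end-of-row advance for the ARGB32 path. */
    static void advance_row(uint8_t **image, uint8_t **py,
                            int h_size, int rgb_stride, int y_stride)
    {
        *image += rgb_stride - 4 * h_size;  /* 4 bytes per ARGB pixel */
        *py    += y_stride - h_size;        /* 1 byte per luma sample */
    }
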
@@ -387,28 +398,28 @@
 
         py += (y_stride - h_size);
         image += (rgb_stride - 4*h_size);
 
         /* load data for start of next scan line */
-        __asm__
+        __asm__ __volatile__
             (
-            ".align 8 \n\t"
-            "movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0\n\t"
-            "movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0\n\t"
+            ".align 8;"
+            "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+            "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 
-            //"movl $0, (%3) # cache preload for image\n\t"
-            "movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t"
+            //"movl $0, (%3);" /* cache preload for image */
+            "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
             : : "r" (py), "r" (pu), "r" (pv), "r" (image)
             );
 
 
         x = 0;
         y += 1;
         even = (!even);
     } while ( y < v_size) ;
 
-    __asm__ ("emms\n\t");
+    __asm__ __volatile__ (EMMS);
 }
 
 yuv2rgb_fun yuv2rgb_init_mmx (int bpp, int mode)
 {
 // if (bpp == 15 || bpp == 16) {
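
For reference, a hedged usage sketch of the initializer the diff ends in: yuv2rgb_init_mmx is assumed to return a yuv2rgb_fun pointer whose signature matches the two converters above, and the bpp/mode values here are placeholders rather than values confirmed by this hunk.

    /* Sketch only -- signature assumed from the converters above. */
    static void convert_frame(uint8_t *image, uint8_t *py, uint8_t *pu,
                              uint8_t *pv, int h_size, int v_size,
                              int rgb_stride, int y_stride, int uv_stride)
    {
        yuv2rgb_fun fun = yuv2rgb_init_mmx(16, 0 /* mode: placeholder */);
        if (fun)
            fun(image, py, pu, pv, h_size, v_size,
                rgb_stride, y_stride, uv_stride);
    }
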