2732
|
1
|
|
2 /*
|
|
3 * yuv2rgb_mmx.c, Software YUV to RGB converter with Intel MMX "technology"
|
|
4 *
|
|
5 * Copyright (C) 2000, Silicon Integrated System Corp.
|
|
6 * All Rights Reserved.
|
|
7 *
|
|
8 * Author: Olie Lho <ollie@sis.com.tw>
|
|
9 *
|
|
10 * This file is part of mpeg2dec, a free MPEG-2 video decoder
|
|
11 *
|
|
12 * mpeg2dec is free software; you can redistribute it and/or modify
|
|
13 * it under the terms of the GNU General Public License as published by
|
|
14 * the Free Software Foundation; either version 2, or (at your option)
|
|
15 * any later version.
|
|
16 *
|
|
17 * mpeg2dec is distributed in the hope that it will be useful,
|
|
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
20 * GNU General Public License for more details.
|
|
21 *
|
|
22 * You should have received a copy of the GNU General Public License
|
|
23 * along with mpeg2dec; see the file COPYING.  If not, write to
|
|
24 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
25 *
|
2749
|
26 * 15,24 bpp and dithering from Michael Niedermayer (michaelni@gmx.at)
|
3143
|
27 * MMX/MMX2 Template stuff from Michael Niedermayer (needed for fast movntq support)
|
2732
|
28 */
|
|
29
|
3143
|
30 #undef MOVNTQ
|
|
31 #undef EMMS
|
|
32 #undef SFENCE
|
2732
|
33
|
3143
|
34 #ifdef HAVE_3DNOW
|
|
35 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
|
|
36 #define EMMS "femms"
|
|
37 #else
|
|
38 #define EMMS "emms"
|
|
39 #endif
|
2739
|
40
|
3143
|
41 #ifdef HAVE_MMX2
|
|
42 #define MOVNTQ "movntq"
|
|
43 #define SFENCE "sfence"
|
|
44 #else
|
|
45 #define MOVNTQ "movq"
|
|
46 #define SFENCE "/nop"
|
|
47 #endif
|
2739
|
48
|
2734
|
49 #define YUV2RGB \
|
|
50 /* Do the multiply part of the conversion for even and odd pixels,
|
|
51 register usage:
|
|
52 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
|
|
53 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
|
|
54 mm6 -> Y even, mm7 -> Y odd */\
|
|
55 /* convert the chroma part */\
|
|
56 "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
|
|
57 "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
|
|
58 \
|
|
59 "psubsw mmx_80w, %%mm0;" /* Cb -= 128 */ \
|
|
60 "psubsw mmx_80w, %%mm1;" /* Cr -= 128 */ \
|
|
61 \
|
|
62 "psllw $3, %%mm0;" /* Promote precision */ \
|
|
63 "psllw $3, %%mm1;" /* Promote precision */ \
|
|
64 \
|
|
65 "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
|
|
66 "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
|
|
67 \
|
|
68 "pmulhw mmx_U_green, %%mm2;" /* Mul Cb with green coeff -> Cb green */ \
|
|
69 "pmulhw mmx_V_green, %%mm3;" /* Mul Cr with green coeff -> Cr green */ \
|
|
70 \
|
|
71 "pmulhw mmx_U_blue, %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\
|
|
72 "pmulhw mmx_V_red, %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\
|
|
73 \
|
|
74 "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\
|
|
75 \
|
|
76 /* convert the luma part */\
|
|
77 "psubusb mmx_10w, %%mm6;" /* Y -= 16 */\
|
|
78 \
|
|
79 "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
|
|
80 "pand mmx_00ffw, %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\
|
|
81 \
|
|
82 "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\
|
|
83 \
|
|
84 "psllw $3, %%mm6;" /* Promote precision */\
|
|
85 "psllw $3, %%mm7;" /* Promote precision */\
|
|
86 \
|
|
87 "pmulhw mmx_Y_coeff, %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\
|
|
88 "pmulhw mmx_Y_coeff, %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\
|
|
89 \
|
|
90 /* Do the addition part of the conversion for even and odd pixels,
|
|
91 register usage:
|
|
92 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
|
|
93 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
|
|
94 mm6 -> Y even, mm7 -> Y odd */\
|
|
95 "movq %%mm0, %%mm3;" /* Copy Cblue */\
|
|
96 "movq %%mm1, %%mm4;" /* Copy Cred */\
|
|
97 "movq %%mm2, %%mm5;" /* Copy Cgreen */\
|
|
98 \
|
|
99 "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */\
|
|
100 "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */\
|
|
101 \
|
|
102 "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */\
|
|
103 "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */\
|
|
104 \
|
|
105 "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\
|
|
106 "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */\
|
|
107 \
|
|
108 /* Limit RGB even to 0..255 */\
|
|
109 "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */\
|
|
110 "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */\
|
|
111 "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */\
|
|
112 \
|
|
113 /* Limit RGB odd to 0..255 */\
|
|
114 "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */\
|
|
115 "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */\
|
|
116 "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */\
|
|
117 \
|
|
118 /* Interleave RGB even and odd */\
|
|
119 "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\
|
|
120 "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\
|
|
121 "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\
|
|
122
|
|
123
|
3143
|
124 static inline void RENAME(yuv420_rgb16) (uint8_t * image, uint8_t * py,
|
2732
|
125 uint8_t * pu, uint8_t * pv,
|
|
126 int h_size, int v_size,
|
|
127 int rgb_stride, int y_stride, int uv_stride)
|
|
128 {
|
|
129 int even = 1;
|
|
130 int x, y;
|
|
131
|
|
132 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
|
|
133
|
|
134 for (y = v_size; --y >= 0; ) {
|
|
135 uint8_t *_image = image;
|
|
136 uint8_t *_py = py;
|
|
137 uint8_t *_pu = pu;
|
|
138 uint8_t *_pv = pv;
|
|
139
|
2749
|
140 b5Dither= dither8[y&1];
|
|
141 g6Dither= dither4[y&1];
|
|
142 g5Dither= dither8[y&1];
|
|
143 r5Dither= dither8[(y+1)&1];
|
|
144
|
2732
|
145 /* load data for start of next scan line */
|
|
146 __asm__ __volatile__ (
|
2734
|
147 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
|
|
148 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
|
|
149 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
|
2732
|
150
|
|
151 : : "r" (_py), "r" (_pu), "r" (_pv));
|
|
152
|
|
153 for (x = h_size >> 3; --x >= 0; ) {
|
|
154 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
|
|
155 pixels in each iteration */
|
|
156
|
|
157 __asm__ __volatile__ (
|
2736
|
158 /* no speed diference on my p3@500 with prefetch,
|
|
159 * if it is faster for anyone with -benchmark then tell me
|
|
160 PREFETCH" 64(%0) \n\t"
|
|
161 PREFETCH" 64(%1) \n\t"
|
|
162 PREFETCH" 64(%2) \n\t"
|
|
163 */
|
2734
|
164 YUV2RGB
|
2732
|
165
|
2749
|
166 #ifdef DITHER1XBPP
|
|
167 "paddusb b5Dither, %%mm0;"
|
|
168 "paddusb g6Dither, %%mm2;"
|
|
169 "paddusb r5Dither, %%mm1;"
|
|
170 #endif
|
2732
|
171 /* mask unneeded bits off */
|
2734
|
172 "pand mmx_redmask, %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
|
|
173 "pand mmx_grnmask, %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */
|
|
174 "pand mmx_redmask, %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
|
2732
|
175
|
2736
|
176 "psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
|
2734
|
177 "pxor %%mm4, %%mm4;" /* zero mm4 */
|
2732
|
178
|
2734
|
179 "movq %%mm0, %%mm5;" /* Copy B7-B0 */
|
|
180 "movq %%mm2, %%mm7;" /* Copy G7-G0 */
|
2732
|
181
|
|
182 /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
|
2734
|
183 "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
|
|
184 "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
|
2732
|
185
|
2736
|
186 "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
|
2734
|
187 "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
|
2732
|
188
|
2734
|
189 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
|
|
190 MOVNTQ " %%mm0, (%3);" /* store pixel 0-3 */
|
2732
|
191
|
|
192 /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
|
2734
|
193 "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
|
|
194 "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
|
2732
|
195
|
2736
|
196 "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
|
2734
|
197 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
|
2732
|
198
|
2734
|
199 "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
|
|
200 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
|
2732
|
201
|
2734
|
202 MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */
|
2732
|
203 : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
|
|
204
|
|
205 _py += 8;
|
|
206 _pu += 4;
|
|
207 _pv += 4;
|
|
208 _image += 16;
|
|
209 }
|
|
210
|
|
211 if (!even) {
|
|
212 pu += uv_stride;
|
|
213 pv += uv_stride;
|
|
214 }
|
|
215
|
|
216 py += y_stride;
|
|
217 image += rgb_stride;
|
|
218
|
|
219 even = (!even);
|
|
220 }
|
|
221
|
|
222 __asm__ __volatile__ (EMMS);
|
|
223 }
|
|
224
|
3143
|
225 static inline void RENAME(yuv420_rgb15) (uint8_t * image, uint8_t * py,
|
2735
|
226 uint8_t * pu, uint8_t * pv,
|
|
227 int h_size, int v_size,
|
|
228 int rgb_stride, int y_stride, int uv_stride)
|
|
229 {
|
|
230 int even = 1;
|
|
231 int x, y;
|
|
232
|
|
233 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
|
|
234
|
|
235 for (y = v_size; --y >= 0; ) {
|
|
236 uint8_t *_image = image;
|
|
237 uint8_t *_py = py;
|
|
238 uint8_t *_pu = pu;
|
|
239 uint8_t *_pv = pv;
|
|
240
|
2749
|
241 b5Dither= dither8[y&1];
|
|
242 g6Dither= dither4[y&1];
|
|
243 g5Dither= dither8[y&1];
|
|
244 r5Dither= dither8[(y+1)&1];
|
|
245
|
2735
|
246 /* load data for start of next scan line */
|
|
247 __asm__ __volatile__ (
|
|
248 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
|
|
249 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
|
|
250 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
|
|
251
|
|
252 : : "r" (_py), "r" (_pu), "r" (_pv));
|
|
253
|
|
254 for (x = h_size >> 3; --x >= 0; ) {
|
|
255 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
|
|
256 pixels in each iteration */
|
|
257
|
|
258 __asm__ __volatile__ (
|
|
259 YUV2RGB
|
|
260
|
2749
|
261 #ifdef DITHER1XBPP
|
|
262 "paddusb b5Dither, %%mm0 \n\t"
|
|
263 "paddusb g5Dither, %%mm2 \n\t"
|
|
264 "paddusb r5Dither, %%mm1 \n\t"
|
|
265 #endif
|
|
266
|
2735
|
267 /* mask unneeded bits off */
|
|
268 "pand mmx_redmask, %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
|
|
269 "pand mmx_redmask, %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */
|
|
270 "pand mmx_redmask, %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
|
|
271
|
2736
|
272 "psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
|
2735
|
273 "psrlw $1,%%mm1;" /* 0_r7r6r5 r4r3_0_0 0_r7r6r5 r4r3_0_0 */
|
|
274 "pxor %%mm4, %%mm4;" /* zero mm4 */
|
|
275
|
|
276 "movq %%mm0, %%mm5;" /* Copy B7-B0 */
|
|
277 "movq %%mm2, %%mm7;" /* Copy G7-G0 */
|
|
278
|
|
279 /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
|
|
280 "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
|
|
281 "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
|
|
282
|
|
283 "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
|
|
284 "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
|
|
285
|
|
286 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
|
|
287 MOVNTQ " %%mm0, (%3);" /* store pixel 0-3 */
|
|
288
|
|
289 /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
|
|
290 "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
|
|
291 "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
|
|
292
|
|
293 "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
|
|
294 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
|
|
295
|
|
296 "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
|
|
297 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
|
|
298
|
|
299 MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */
|
|
300 : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
|
|
301
|
|
302 _py += 8;
|
|
303 _pu += 4;
|
|
304 _pv += 4;
|
|
305 _image += 16;
|
|
306 }
|
|
307
|
|
308 if (!even) {
|
|
309 pu += uv_stride;
|
|
310 pv += uv_stride;
|
|
311 }
|
|
312
|
|
313 py += y_stride;
|
|
314 image += rgb_stride;
|
|
315
|
|
316 even = (!even);
|
|
317 }
|
|
318
|
|
319 __asm__ __volatile__ (EMMS);
|
|
320 }
|
|
321
|
3143
|
322 static inline void RENAME(yuv420_rgb24) (uint8_t * image, uint8_t * py,
|
2734
|
323 uint8_t * pu, uint8_t * pv,
|
|
324 int h_size, int v_size,
|
|
325 int rgb_stride, int y_stride, int uv_stride)
|
|
326 {
|
|
327 int even = 1;
|
|
328 int x, y;
|
|
329
|
|
330 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
|
|
331
|
|
332 for (y = v_size; --y >= 0; ) {
|
|
333 uint8_t *_image = image;
|
|
334 uint8_t *_py = py;
|
|
335 uint8_t *_pu = pu;
|
|
336 uint8_t *_pv = pv;
|
|
337
|
|
338 /* load data for start of next scan line */
|
|
339 __asm__ __volatile__ (
|
|
340 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
|
|
341 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
|
|
342 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
|
|
343
|
|
344 : : "r" (_py), "r" (_pu), "r" (_pv));
|
|
345
|
|
346 for (x = h_size >> 3; --x >= 0; ) {
|
|
347 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
|
|
348 pixels in each iteration */
|
|
349
|
|
350 __asm__ __volatile__ (
|
|
351 YUV2RGB
|
2739
|
352 /* mm0=B, %%mm2=G, %%mm1=R */
|
|
353 #ifdef HAVE_MMX2
|
|
354 "movq M24A, %%mm4 \n\t"
|
|
355 "movq M24C, %%mm7 \n\t"
|
|
356 "pshufw $0x50, %%mm0, %%mm5 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */
|
|
357 "pshufw $0x50, %%mm2, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */
|
|
358 "pshufw $0x00, %%mm1, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */
|
2734
|
359
|
2739
|
360 "pand %%mm4, %%mm5 \n\t" /* B2 B1 B0 */
|
|
361 "pand %%mm4, %%mm3 \n\t" /* G2 G1 G0 */
|
|
362 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */
|
|
363
|
|
364 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */
|
|
365 "por %%mm5, %%mm6 \n\t"
|
|
366 "por %%mm3, %%mm6 \n\t"
|
|
367 MOVNTQ" %%mm6, (%3) \n\t"
|
|
368
|
|
369 "psrlq $8, %%mm2 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */
|
|
370 "pshufw $0xA5, %%mm0, %%mm5 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */
|
|
371 "pshufw $0x55, %%mm2, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */
|
|
372 "pshufw $0xA5, %%mm1, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */
|
|
373
|
|
374 "pand M24B, %%mm5 \n\t" /* B5 B4 B3 */
|
|
375 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */
|
|
376 "pand %%mm4, %%mm6 \n\t" /* R4 R3 R2 */
|
|
377
|
|
378 "por %%mm5, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */
|
|
379 "por %%mm3, %%mm6 \n\t"
|
|
380 MOVNTQ" %%mm6, 8(%3) \n\t"
|
|
381
|
|
382 "pshufw $0xFF, %%mm0, %%mm5 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */
|
|
383 "pshufw $0xFA, %%mm2, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */
|
|
384 "pshufw $0xFA, %%mm1, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */
|
|
385 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
|
|
386
|
|
387 "pand %%mm7, %%mm5 \n\t" /* B7 B6 */
|
|
388 "pand %%mm4, %%mm3 \n\t" /* G7 G6 G5 */
|
|
389 "pand M24B, %%mm6 \n\t" /* R7 R6 R5 */
|
|
390 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
|
|
391 \
|
|
392 "por %%mm5, %%mm3 \n\t"
|
|
393 "por %%mm3, %%mm6 \n\t"
|
|
394 MOVNTQ" %%mm6, 16(%3) \n\t"
|
|
395 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
|
|
396 "pxor %%mm4, %%mm4 \n\t"
|
|
397
|
|
398 #else
|
|
399
|
2734
|
400 "pxor %%mm4, %%mm4 \n\t"
|
|
401 "movq %%mm0, %%mm5 \n\t" /* B */
|
|
402 "movq %%mm1, %%mm6 \n\t" /* R */
|
|
403 "punpcklbw %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */
|
|
404 "punpcklbw %%mm4, %%mm1 \n\t" /* 0R0R0R0R 0 */
|
|
405 "punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */
|
|
406 "punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */
|
|
407 "movq %%mm0, %%mm7 \n\t" /* GBGBGBGB 0 */
|
|
408 "movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */
|
|
409 "punpcklwd %%mm1, %%mm7 \n\t" /* 0RGB0RGB 0 */
|
|
410 "punpckhwd %%mm1, %%mm0 \n\t" /* 0RGB0RGB 1 */
|
|
411 "punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */
|
|
412 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */
|
|
413
|
|
414 "movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */
|
|
415 "movq %%mm0, %%mm6 \n\t" /* 0RGB0RGB 1 */
|
|
416 "movq %%mm5, %%mm1 \n\t" /* 0RGB0RGB 2 */
|
|
417 "movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */
|
|
418
|
|
419 "psllq $40, %%mm7 \n\t" /* RGB00000 0 */
|
|
420 "psllq $40, %%mm0 \n\t" /* RGB00000 1 */
|
|
421 "psllq $40, %%mm5 \n\t" /* RGB00000 2 */
|
|
422 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */
|
|
423
|
|
424 "punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */
|
|
425 "punpckhdq %%mm6, %%mm0 \n\t" /* 0RGBRGB0 1 */
|
|
426 "punpckhdq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */
|
|
427 "punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */
|
|
428
|
|
429 "psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */
|
|
430 "movq %%mm0, %%mm6 \n\t" /* 0RGBRGB0 1 */
|
|
431 "psllq $40, %%mm0 \n\t" /* GB000000 1 */
|
|
432 "por %%mm0, %%mm7 \n\t" /* GBRGBRGB 0 */
|
|
433 MOVNTQ" %%mm7, (%3) \n\t"
|
|
434
|
|
435 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
|
|
436
|
|
437 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */
|
|
438 "movq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */
|
|
439 "psllq $24, %%mm5 \n\t" /* BRGB0000 2 */
|
|
440 "por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */
|
|
441 MOVNTQ" %%mm6, 8(%3) \n\t"
|
|
442
|
|
443 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
|
|
444
|
|
445 "psrlq $40, %%mm1 \n\t" /* 000000RG 2 */
|
|
446 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */
|
|
447 "por %%mm3, %%mm1 \n\t" /* RGBRGBRG 2 */
|
|
448 MOVNTQ" %%mm1, 16(%3) \n\t"
|
|
449
|
|
450 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
|
|
451 "pxor %%mm4, %%mm4 \n\t"
|
2739
|
452 #endif
|
2734
|
453
|
|
454 : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
|
|
455
|
|
456 _py += 8;
|
|
457 _pu += 4;
|
|
458 _pv += 4;
|
|
459 _image += 24;
|
|
460 }
|
|
461
|
|
462 if (!even) {
|
|
463 pu += uv_stride;
|
|
464 pv += uv_stride;
|
|
465 }
|
|
466
|
|
467 py += y_stride;
|
|
468 image += rgb_stride;
|
|
469
|
|
470 even = (!even);
|
|
471 }
|
|
472
|
|
473 __asm__ __volatile__ (EMMS);
|
|
474 }
|
|
475
|
|
476
|
3143
|
477 static inline void RENAME(yuv420_argb32) (uint8_t * image, uint8_t * py,
|
2732
|
478 uint8_t * pu, uint8_t * pv,
|
|
479 int h_size, int v_size,
|
|
480 int rgb_stride, int y_stride, int uv_stride)
|
|
481 {
|
|
482 int even = 1;
|
|
483 int x, y;
|
|
484
|
|
485 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
|
|
486
|
|
487 for (y = v_size; --y >= 0; ) {
|
|
488 uint8_t *_image = image;
|
|
489 uint8_t *_py = py;
|
|
490 uint8_t *_pu = pu;
|
|
491 uint8_t *_pv = pv;
|
|
492
|
|
493 /* load data for start of next scan line */
|
|
494 __asm__ __volatile__
|
|
495 (
|
|
496 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
|
|
497 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
|
|
498 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
|
|
499 : : "r" (_py), "r" (_pu), "r" (_pv)
|
|
500 );
|
|
501
|
|
502 for (x = h_size >> 3; --x >= 0; ) {
|
|
503 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
|
|
504 pixels in each iteration */
|
|
505 __asm__ __volatile__ (
|
2734
|
506 YUV2RGB
|
|
507 /* convert RGB plane to RGB packed format,
|
2732
|
508 mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
|
|
509 mm4 -> GB, mm5 -> AR pixel 4-7,
|
|
510 mm6 -> GB, mm7 -> AR pixel 0-3 */
|
2734
|
511 "pxor %%mm3, %%mm3;" /* zero mm3 */
|
2732
|
512
|
2734
|
513 "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
|
|
514 "movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
|
2732
|
515
|
2734
|
516 "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
|
|
517 "movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
|
2732
|
518
|
2734
|
519 "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
|
|
520 "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */
|
2732
|
521
|
2734
|
522 "punpcklwd %%mm7, %%mm6;" /* 00 R1 B1 G1 00 R0 B0 G0 */
|
|
523 MOVNTQ " %%mm6, (%3);" /* Store ARGB1 ARGB0 */
|
2732
|
524
|
2734
|
525 "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
|
|
526 "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
|
2732
|
527
|
2734
|
528 "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 B3 G2 */
|
|
529 MOVNTQ " %%mm6, 8 (%3);" /* Store ARGB3 ARGB2 */
|
2732
|
530
|
2734
|
531 "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
|
|
532 "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */
|
|
533
|
|
534 "punpcklwd %%mm5, %%mm4;" /* 00 R5 B5 G5 00 R4 B4 G4 */
|
|
535 MOVNTQ " %%mm4, 16 (%3);" /* Store ARGB5 ARGB4 */
|
2732
|
536
|
2734
|
537 "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
|
|
538 "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
|
2732
|
539
|
2734
|
540 "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 B6 G6 */
|
|
541 MOVNTQ " %%mm4, 24 (%3);" /* Store ARGB7 ARGB6 */
|
2732
|
542
|
2734
|
543 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
|
|
544 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
|
2732
|
545
|
2734
|
546 "pxor %%mm4, %%mm4;" /* zero mm4 */
|
|
547 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
|
2732
|
548
|
|
549 : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
|
|
550
|
|
551 _py += 8;
|
|
552 _pu += 4;
|
|
553 _pv += 4;
|
|
554 _image += 32;
|
|
555 }
|
|
556
|
|
557 if (!even) {
|
|
558 pu += uv_stride;
|
|
559 pv += uv_stride;
|
|
560 }
|
|
561
|
|
562 py += y_stride;
|
|
563 image += rgb_stride;
|
|
564
|
|
565 even = (!even);
|
|
566 }
|
|
567
|
|
568 __asm__ __volatile__ (EMMS);
|
|
569 }
|
|
570
|
3143
|
571 yuv2rgb_fun RENAME(yuv2rgb_init) (int bpp, int mode)
|
2732
|
572 {
|
3143
|
573 if (bpp == 15 && mode == MODE_RGB) return RENAME(yuv420_rgb15);
|
|
574 if (bpp == 16 && mode == MODE_RGB) return RENAME(yuv420_rgb16);
|
|
575 if (bpp == 24 && mode == MODE_RGB) return RENAME(yuv420_rgb24);
|
|
576 if (bpp == 32 && mode == MODE_RGB) return RENAME(yuv420_argb32);
|
2732
|
577 return NULL; // Fallback to C.
|
|
578 }
|
|
579
|