Mercurial > mplayer.hg
comparison libvo/yuv2rgb_mmx.c @ 1101:961f53221ffc
Code cleanup and fix missing config.h and use femms on K6 2/2+/3.
author | atmosfear |
---|---|
date | Mon, 11 Jun 2001 17:43:15 +0000 |
parents | ed6ac3915d59 |
children | 7ce37211e454 |
comparison
equal
deleted
inserted
replaced
1100:7829676dc4ef | 1101:961f53221ffc |
---|---|
25 * | 25 * |
26 */ | 26 */ |
27 | 27 |
28 #include <stdio.h> | 28 #include <stdio.h> |
29 #include <stdlib.h> | 29 #include <stdlib.h> |
30 | |
31 #include "../config.h" | |
30 | 32 |
31 #include "mmx.h" | 33 #include "mmx.h" |
32 //#include "libmpeg2/mpeg2.h" | 34 //#include "libmpeg2/mpeg2.h" |
33 //#include "libmpeg2/mpeg2_internal.h" | 35 //#include "libmpeg2/mpeg2_internal.h" |
34 #include <inttypes.h> | 36 #include <inttypes.h> |
52 uint64_t mmx_grnmask = 0xfcfcfcfcfcfcfcfc; | 54 uint64_t mmx_grnmask = 0xfcfcfcfcfcfcfcfc; |
53 uint64_t mmx_grnshift = 0x03; | 55 uint64_t mmx_grnshift = 0x03; |
54 uint64_t mmx_blueshift = 0x03; | 56 uint64_t mmx_blueshift = 0x03; |
55 | 57 |
56 #ifdef HAVE_MMX2 | 58 #ifdef HAVE_MMX2 |
57 #define movntq "movntq" // use this for K7 and p3 only | 59 /* use this for K7 and p3 only */ |
60 #define MOVNTQ "movntq" | |
58 #else | 61 #else |
59 #define movntq "movq" // for MMX-only processors | 62 /* for MMX-only processors */ |
63 #define MOVNTQ "movq" | |
64 #endif | |
65 | |
66 #if !defined( HAVE_MMX2) && defined( HAVE_3DNOW) | |
67 /* for K6 2/2+/3 */ | |
68 #define EMMS "femms;" | |
69 #else | |
70 #define EMMS "emms;" | |
60 #endif | 71 #endif |
61 | 72 |
62 static void yuv420_rgb16_mmx (uint8_t * image, uint8_t * py, | 73 static void yuv420_rgb16_mmx (uint8_t * image, uint8_t * py, |
63 uint8_t * pu, uint8_t * pv, | 74 uint8_t * pu, uint8_t * pv, |
64 int h_size, int v_size, | 75 int h_size, int v_size, |
66 { | 77 { |
67 int even = 1; | 78 int even = 1; |
68 int x = 0, y = 0; | 79 int x = 0, y = 0; |
69 | 80 |
70 /* load data for first scan line */ | 81 /* load data for first scan line */ |
71 __asm__ ( | 82 __asm__ __volatile__ ( |
72 "movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0\n\t" | 83 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
73 "movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0\n\t" | 84 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
74 | 85 |
75 "pxor %%mm4, %%mm4 # zero mm4\n\t" | 86 "pxor %%mm4, %%mm4;" /* zero mm4 */ |
76 "movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t" | 87 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
77 | 88 |
78 //"movl $0, (%3) # cache preload for image\n\t" | 89 //"movl $0, (%3);" /* cache preload for image */ |
79 : : "r" (py), "r" (pu), "r" (pv), "r" (image)); | 90 : : "r" (py), "r" (pu), "r" (pv), "r" (image)); |
80 | 91 |
81 do { | 92 do { |
82 do { | 93 do { |
83 /* this mmx assembly code deals with SINGLE scan line at a time, it converts 8 | 94 /* this mmx assembly code deals with SINGLE scan line at a time, it converts 8 |
84 pixels in each iteration */ | 95 pixels in each iteration */ |
85 __asm__ (".align 8 \n\t" | 96 __asm__ __volatile__ (".align 8;" |
86 /* Do the multiply part of the conversion for even and odd pixels, | 97 /* Do the multiply part of the conversion for even and odd pixels, |
87 register usage: | 98 register usage: |
88 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, | 99 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, |
89 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, | 100 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, |
90 mm6 -> Y even, mm7 -> Y odd */ | 101 mm6 -> Y even, mm7 -> Y odd */ |
91 /* convert the chroma part */ | 102 /* convert the chroma part */ |
92 "punpcklbw %%mm4, %%mm0 # scatter 4 Cb 00 u3 00 u2 00 u1 00 u0\n\t" | 103 "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ |
93 "punpcklbw %%mm4, %%mm1 # scatter 4 Cr 00 v3 00 v2 00 v1 00 v0\n\t" | 104 "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ |
94 | 105 |
95 "psubsw mmx_80w, %%mm0 # Cb -= 128\n\t" | 106 "psubsw mmx_80w, %%mm0;" /* Cb -= 128 */ |
96 "psubsw mmx_80w, %%mm1 # Cr -= 128\n\t" | 107 "psubsw mmx_80w, %%mm1;" /* Cr -= 128 */ |
97 | 108 |
98 "psllw $3, %%mm0 # Promote precision\n\t" | 109 "psllw $3, %%mm0;" /* Promote precision */ |
99 "psllw $3, %%mm1 # Promote precision\n\t" | 110 "psllw $3, %%mm1;" /* Promote precision */ |
100 | 111 |
101 "movq %%mm0, %%mm2 # Copy 4 Cb 00 u3 00 u2 00 u1 00 u0\n\t" | 112 "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ |
102 "movq %%mm1, %%mm3 # Copy 4 Cr 00 v3 00 v2 00 v1 00 v0\n\t" | 113 "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ |
103 | 114 |
104 "pmulhw mmx_U_green, %%mm2# Mul Cb with green coeff -> Cb green\n\t" | 115 "pmulhw mmx_U_green, %%mm2;" /* Mul Cb with green coeff -> Cb green */ |
105 "pmulhw mmx_V_green, %%mm3# Mul Cr with green coeff -> Cr green\n\t" | 116 "pmulhw mmx_V_green, %%mm3;" /* Mul Cr with green coeff -> Cr green */ |
106 | 117 |
107 "pmulhw mmx_U_blue, %%mm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0\n\t" | 118 "pmulhw mmx_U_blue, %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */ |
108 "pmulhw mmx_V_red, %%mm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0\n\t" | 119 "pmulhw mmx_V_red, %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */ |
109 | 120 |
110 "paddsw %%mm3, %%mm2 # Cb green + Cr green -> Cgreen\n\t" | 121 "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */ |
111 | 122 |
112 /* convert the luma part */ | 123 /* convert the luma part */ |
113 "psubusb mmx_10w, %%mm6 # Y -= 16\n\t" | 124 "psubusb mmx_10w, %%mm6;" /* Y -= 16 */ |
114 | 125 |
115 "movq %%mm6, %%mm7 # Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t" | 126 "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
116 "pand mmx_00ffw, %%mm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0\n\t" | 127 "pand mmx_00ffw, %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */ |
117 | 128 |
118 "psrlw $8, %%mm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1\n\t" | 129 "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */ |
119 | 130 |
120 "psllw $3, %%mm6 # Promote precision\n\t" | 131 "psllw $3, %%mm6;" /* Promote precision */ |
121 "psllw $3, %%mm7 # Promote precision\n\t" | 132 "psllw $3, %%mm7;" /* Promote precision */ |
122 | 133 |
123 "pmulhw mmx_Y_coeff, %%mm6# Mul 4 Y even 00 y6 00 y4 00 y2 00 y0\n\t" | 134 "pmulhw mmx_Y_coeff, %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */ |
124 "pmulhw mmx_Y_coeff, %%mm7# Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1\n\t" | 135 "pmulhw mmx_Y_coeff, %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */ |
125 | 136 |
126 /* Do the addition part of the conversion for even and odd pixels, | 137 /* Do the addition part of the conversion for even and odd pixels, |
127 register usage: | 138 register usage: |
128 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, | 139 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, |
129 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, | 140 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, |
130 mm6 -> Y even, mm7 -> Y odd */ | 141 mm6 -> Y even, mm7 -> Y odd */ |
131 "movq %%mm0, %%mm3 # Copy Cblue\n\t" | 142 "movq %%mm0, %%mm3;" /* Copy Cblue */ |
132 "movq %%mm1, %%mm4 # Copy Cred\n\t" | 143 "movq %%mm1, %%mm4;" /* Copy Cred */ |
133 "movq %%mm2, %%mm5 # Copy Cgreen\n\t" | 144 "movq %%mm2, %%mm5;" /* Copy Cgreen */ |
134 | 145 |
135 "paddsw %%mm6, %%mm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0\n\t" | 146 "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */ |
136 "paddsw %%mm7, %%mm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1\n\t" | 147 "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */ |
137 | 148 |
138 "paddsw %%mm6, %%mm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0\n\t" | 149 "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */ |
139 "paddsw %%mm7, %%mm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1\n\t" | 150 "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */ |
140 | 151 |
141 "paddsw %%mm6, %%mm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0\n\t" | 152 "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */ |
142 "paddsw %%mm7, %%mm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1\n\t" | 153 "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */ |
143 | 154 |
144 /* Limit RGB even to 0..255 */ | 155 /* Limit RGB even to 0..255 */ |
145 "packuswb %%mm0, %%mm0 # B6 B4 B2 B0 B6 B4 B2 B0\n\t" | 156 "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */ |
146 "packuswb %%mm1, %%mm1 # R6 R4 R2 R0 R6 R4 R2 R0\n\t" | 157 "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */ |
147 "packuswb %%mm2, %%mm2 # G6 G4 G2 G0 G6 G4 G2 G0\n\t" | 158 "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */ |
148 | 159 |
149 /* Limit RGB odd to 0..255 */ | 160 /* Limit RGB odd to 0..255 */ |
150 "packuswb %%mm3, %%mm3 # B7 B5 B3 B1 B7 B5 B3 B1\n\t" | 161 "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */ |
151 "packuswb %%mm4, %%mm4 # R7 R5 R3 R1 R7 R5 R3 R1\n\t" | 162 "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */ |
152 "packuswb %%mm5, %%mm5 # G7 G5 G3 G1 G7 G5 G3 G1\n\t" | 163 "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */ |
153 | 164 |
154 /* Interleave RGB even and odd */ | 165 /* Interleave RGB even and odd */ |
155 "punpcklbw %%mm3, %%mm0 # B7 B6 B5 B4 B3 B2 B1 B0\n\t" | 166 "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ |
156 "punpcklbw %%mm4, %%mm1 # R7 R6 R5 R4 R3 R2 R1 R0\n\t" | 167 "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ |
157 "punpcklbw %%mm5, %%mm2 # G7 G6 G5 G4 G3 G2 G1 G0\n\t" | 168 "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */ |
158 | 169 |
159 /* mask unneeded bits off */ | 170 /* mask unneeded bits off */ |
160 "pand mmx_redmask, %%mm0# b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0\n\t" | 171 "pand mmx_redmask, %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */ |
161 "pand mmx_grnmask, %%mm2# g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0\n\t" | 172 "pand mmx_grnmask, %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */ |
162 "pand mmx_redmask, %%mm1# r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0\n\t" | 173 "pand mmx_redmask, %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */ |
163 | 174 |
164 "psrlw mmx_blueshift,%%mm0#0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3\n\t" | 175 "psrlw mmx_blueshift,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */ |
165 "pxor %%mm4, %%mm4 # zero mm4\n\t" | 176 "pxor %%mm4, %%mm4;" /* zero mm4 */ |
166 | 177 |
167 "movq %%mm0, %%mm5 # Copy B7-B0\n\t" | 178 "movq %%mm0, %%mm5;" /* Copy B7-B0 */ |
168 "movq %%mm2, %%mm7 # Copy G7-G0\n\t" | 179 "movq %%mm2, %%mm7;" /* Copy G7-G0 */ |
169 | 180 |
170 /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ | 181 /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ |
171 "punpcklbw %%mm4, %%mm2 # 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0\n\t" | 182 "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ |
172 "punpcklbw %%mm1, %%mm0 # r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3\n\t" | 183 "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ |
173 | 184 |
174 "psllw mmx_blueshift,%%mm2# 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0\n\t" | 185 "psllw mmx_blueshift,%%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */ |
175 "por %%mm2, %%mm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3\n\t" | 186 "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ |
176 | 187 |
177 "movq 8 (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t" | 188 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
178 movntq " %%mm0, (%3) # store pixel 0-3\n\t" | 189 MOVNTQ " %%mm0, (%3);" /* store pixel 0-3 */ |
179 | 190 |
180 /* convert rgb24 plane to rgb16 pack for pixel 4-7 */ | 191 /* convert rgb24 plane to rgb16 pack for pixel 4-7 */ |
181 "punpckhbw %%mm4, %%mm7 # 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0\n\t" | 192 "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ |
182 "punpckhbw %%mm1, %%mm5 # r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3\n\t" | 193 "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ |
183 | 194 |
184 "psllw mmx_blueshift,%%mm7# 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0\n\t" | 195 "psllw mmx_blueshift,%%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */ |
185 "movd 4 (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0\n\t" | 196 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
186 | 197 |
187 "por %%mm7, %%mm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3\n\t" | 198 "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ |
188 "movd 4 (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0\n\t" | 199 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
189 | 200 |
190 movntq " %%mm5, 8 (%3) # store pixel 4-7\n\t" | 201 MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */ |
191 : : "r" (py), "r" (pu), "r" (pv), "r" (image)); | 202 : : "r" (py), "r" (pu), "r" (pv), "r" (image)); |
192 | 203 |
193 py += 8; | 204 py += 8; |
194 pu += 4; | 205 pu += 4; |
195 pv += 4; | 206 pv += 4; |
207 | 218 |
208 py += (y_stride - h_size); | 219 py += (y_stride - h_size); |
209 image += (rgb_stride - 2*h_size); | 220 image += (rgb_stride - 2*h_size); |
210 | 221 |
211 /* load data for start of next scan line */ | 222 /* load data for start of next scan line */ |
212 __asm__ ( | 223 __asm__ __volatile__ ( |
213 "movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0\n\t" | 224 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
214 "movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0\n\t" | 225 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
215 | 226 |
216 //"movl $0, (%3) # cache preload for image\n\t" | 227 //"movl $0, (%3);" /* cache preload for image */ |
217 "movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t" | 228 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
218 | 229 |
219 : : "r" (py), "r" (pu), "r" (pv), "r" (image)); | 230 : : "r" (py), "r" (pu), "r" (pv), "r" (image)); |
220 | 231 |
221 x = 0; | 232 x = 0; |
222 y += 1; | 233 y += 1; |
223 even = (!even); | 234 even = (!even); |
224 } while (y < v_size) ; | 235 } while (y < v_size) ; |
225 | 236 |
226 __asm__ ("emms\n\t"); | 237 __asm__ __volatile__ (EMMS); |
227 } | 238 } |
228 | 239 |
229 static void yuv420_argb32_mmx (uint8_t * image, uint8_t * py, | 240 static void yuv420_argb32_mmx (uint8_t * image, uint8_t * py, |
230 uint8_t * pu, uint8_t * pv, | 241 uint8_t * pu, uint8_t * pv, |
231 int h_size, int v_size, | 242 int h_size, int v_size, |
232 int rgb_stride, int y_stride, int uv_stride) | 243 int rgb_stride, int y_stride, int uv_stride) |
233 { | 244 { |
234 int even = 1; | 245 int even = 1; |
235 int x = 0, y = 0; | 246 int x = 0, y = 0; |
236 | 247 |
237 __asm__ ( | 248 __asm__ __volatile__ ( |
238 ".align 8 \n\t" | 249 ".align 8;" |
239 "movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0\n\t" | 250 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
240 //"movl $0, (%3) # cache preload for image\n\t" | 251 //"movl $0, (%3);" /* cache preload for image */ |
241 | 252 |
242 "movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0\n\t" | 253 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
243 "pxor %%mm4, %%mm4 # zero mm4\n\t" | 254 "pxor %%mm4, %%mm4;" /* zero mm4 */ |
244 | 255 |
245 "movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t" | 256 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
246 : : "r" (py), "r" (pu), "r" (pv), "r" (image)); | 257 : : "r" (py), "r" (pu), "r" (pv), "r" (image)); |
247 | 258 |
248 do { | 259 do { |
249 do { | 260 do { |
250 /* this mmx assembly code deals with SINGLE scan line at a time, it converts 8 | 261 /* this mmx assembly code deals with SINGLE scan line at a time, it converts 8 |
251 pixels in each iteration */ | 262 pixels in each iteration */ |
252 __asm__ ( | 263 __asm__ __volatile__ ( |
253 ".align 8 \n\t" | 264 ".align 8;" |
254 /* Do the multiply part of the conversion for even and odd pixels, | 265 /* Do the multiply part of the conversion for even and odd pixels, |
255 register usage: | 266 register usage: |
256 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, | 267 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, |
257 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, | 268 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, |
258 mm6 -> Y even, mm7 -> Y odd */ | 269 mm6 -> Y even, mm7 -> Y odd */ |
259 | 270 |
260 /* convert the chroma part */ | 271 /* convert the chroma part */ |
261 "punpcklbw %%mm4, %%mm0 # scatter 4 Cb 00 u3 00 u2 00 u1 00 u0\n\t" | 272 "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ |
262 "punpcklbw %%mm4, %%mm1 # scatter 4 Cr 00 v3 00 v2 00 v1 00 v0\n\t" | 273 "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ |
263 | 274 |
264 "psubsw mmx_80w, %%mm0 # Cb -= 128\n\t" | 275 "psubsw mmx_80w, %%mm0;" /* Cb -= 128 */ |
265 "psubsw mmx_80w, %%mm1 # Cr -= 128\n\t" | 276 "psubsw mmx_80w, %%mm1;" /* Cr -= 128 */ |
266 | 277 |
267 "psllw $3, %%mm0 # Promote precision\n\t" | 278 "psllw $3, %%mm0;" /* Promote precision */ |
268 "psllw $3, %%mm1 # Promote precision\n\t" | 279 "psllw $3, %%mm1;" /* Promote precision */ |
269 | 280 |
270 "movq %%mm0, %%mm2 # Copy 4 Cb 00 u3 00 u2 00 u1 00 u0\n\t" | 281 "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ |
271 "movq %%mm1, %%mm3 # Copy 4 Cr 00 v3 00 v2 00 v1 00 v0\n\t" | 282 "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ |
272 | 283 |
273 "pmulhw mmx_U_green, %%mm2# Mul Cb with green coeff -> Cb green\n\t" | 284 "pmulhw mmx_U_green, %%mm2;" /* Mul Cb with green coeff -> Cb green */ |
274 "pmulhw mmx_V_green, %%mm3# Mul Cr with green coeff -> Cr green\n\t" | 285 "pmulhw mmx_V_green, %%mm3;" /* Mul Cr with green coeff -> Cr green */ |
275 | 286 |
276 "pmulhw mmx_U_blue, %%mm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0\n\t" | 287 "pmulhw mmx_U_blue, %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */ |
277 "pmulhw mmx_V_red, %%mm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0\n\t" | 288 "pmulhw mmx_V_red, %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */ |
278 | 289 |
279 "paddsw %%mm3, %%mm2 # Cb green + Cr green -> Cgreen\n\t" | 290 "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */ |
280 | 291 |
281 /* convert the luma part */ | 292 /* convert the luma part */ |
282 "psubusb mmx_10w, %%mm6 # Y -= 16\n\t" | 293 "psubusb mmx_10w, %%mm6;" /* Y -= 16 */ |
283 | 294 |
284 "movq %%mm6, %%mm7 # Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t" | 295 "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
285 "pand mmx_00ffw, %%mm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0\n\t" | 296 "pand mmx_00ffw, %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */ |
286 | 297 |
287 "psrlw $8, %%mm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1\n\t" | 298 "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */ |
288 | 299 |
289 "psllw $3, %%mm6 # Promote precision\n\t" | 300 "psllw $3, %%mm6;" /* Promote precision */ |
290 "psllw $3, %%mm7 # Promote precision\n\t" | 301 "psllw $3, %%mm7;" /* Promote precision */ |
291 | 302 |
292 "pmulhw mmx_Y_coeff, %%mm6# Mul 4 Y even 00 y6 00 y4 00 y2 00 y0\n\t" | 303 "pmulhw mmx_Y_coeff, %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */ |
293 "pmulhw mmx_Y_coeff, %%mm7# Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1\n\t" | 304 "pmulhw mmx_Y_coeff, %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */ |
294 | 305 |
295 /* Do the addition part of the conversion for even and odd pixels, | 306 /* Do the addition part of the conversion for even and odd pixels, |
296 register usage: | 307 register usage: |
297 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, | 308 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, |
298 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, | 309 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, |
299 mm6 -> Y even, mm7 -> Y odd */ | 310 mm6 -> Y even, mm7 -> Y odd */ |
300 | 311 |
301 "movq %%mm0, %%mm3 # Copy Cblue\n\t" | 312 "movq %%mm0, %%mm3;" /* Copy Cblue */ |
302 "movq %%mm1, %%mm4 # Copy Cred\n\t" | 313 "movq %%mm1, %%mm4;" /* Copy Cred */ |
303 "movq %%mm2, %%mm5 # Copy Cgreen\n\t" | 314 "movq %%mm2, %%mm5;" /* Copy Cgreen */ |
304 | 315 |
305 "paddsw %%mm6, %%mm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0\n\t" | 316 "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */ |
306 "paddsw %%mm7, %%mm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1\n\t" | 317 "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */ |
307 | 318 |
308 "paddsw %%mm6, %%mm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0\n\t" | 319 "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */ |
309 "paddsw %%mm7, %%mm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1\n\t" | 320 "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */ |
310 | 321 |
311 "paddsw %%mm6, %%mm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0\n\t" | 322 "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */ |
312 "paddsw %%mm7, %%mm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1\n\t" | 323 "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */ |
313 | 324 |
314 /* Limit RGB even to 0..255 */ | 325 /* Limit RGB even to 0..255 */ |
315 "packuswb %%mm0, %%mm0 # B6 B4 B2 B0 B6 B4 B2 B0\n\t" | 326 "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */ |
316 "packuswb %%mm1, %%mm1 # R6 R4 R2 R0 R6 R4 R2 R0\n\t" | 327 "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */ |
317 "packuswb %%mm2, %%mm2 # G6 G4 G2 G0 G6 G4 G2 G0\n\t" | 328 "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */ |
318 | 329 |
319 /* Limit RGB odd to 0..255 */ | 330 /* Limit RGB odd to 0..255 */ |
320 "packuswb %%mm3, %%mm3 # B7 B5 B3 B1 B7 B5 B3 B1\n\t" | 331 "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */ |
321 "packuswb %%mm4, %%mm4 # R7 R5 R3 R1 R7 R5 R3 R1\n\t" | 332 "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */ |
322 "packuswb %%mm5, %%mm5 # G7 G5 G3 G1 G7 G5 G3 G1\n\t" | 333 "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */ |
323 | 334 |
324 /* Interleave RGB even and odd */ | 335 /* Interleave RGB even and odd */ |
325 "punpcklbw %%mm3, %%mm0 # B7 B6 B5 B4 B3 B2 B1 B0\n\t" | 336 "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ |
326 "punpcklbw %%mm4, %%mm1 # R7 R6 R5 R4 R3 R2 R1 R0\n\t" | 337 "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ |
327 "punpcklbw %%mm5, %%mm2 # G7 G6 G5 G4 G3 G2 G1 G0\n\t" | 338 "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */ |
328 | 339 |
329 /* convert RGB plane to RGB packed format, | 340 /* convert RGB plane to RGB packed format, |
330 mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0, | 341 mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0, |
331 mm4 -> GB, mm5 -> AR pixel 4-7, | 342 mm4 -> GB, mm5 -> AR pixel 4-7, |
332 mm6 -> GB, mm7 -> AR pixel 0-3 */ | 343 mm6 -> GB, mm7 -> AR pixel 0-3 */ |
333 "pxor %%mm3, %%mm3 # zero mm3\n\t" | 344 "pxor %%mm3, %%mm3;" /* zero mm3 */ |
334 | 345 |
335 "movq %%mm0, %%mm6 # B7 B6 B5 B4 B3 B2 B1 B0\n\t" | 346 "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ |
336 "movq %%mm1, %%mm7 # R7 R6 R5 R4 R3 R2 R1 R0\n\t" | 347 "movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ |
337 | 348 |
338 "movq %%mm0, %%mm4 # B7 B6 B5 B4 B3 B2 B1 B0\n\t" | 349 "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ |
339 "movq %%mm1, %%mm5 # R7 R6 R5 R4 R3 R2 R1 R0\n\t" | 350 "movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ |
340 | 351 |
341 "punpcklbw %%mm2, %%mm6 # G3 B3 G2 B2 G1 B1 G0 B0\n\t" | 352 "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ |
342 "punpcklbw %%mm3, %%mm7 # 00 R3 00 R2 00 R1 00 R0\n\t" | 353 "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */ |
343 | 354 |
344 "punpcklwd %%mm7, %%mm6 # 00 R1 G1 B1 00 R0 G0 B0\n\t" | 355 "punpcklwd %%mm7, %%mm6;" /* 00 R1 G1 B1 00 R0 G0 B0 */ |
345 movntq " %%mm6, (%3) # Store ARGB1 ARGB0\n\t" | 356 MOVNTQ " %%mm6, (%3);" /* Store ARGB1 ARGB0 */ |
346 | 357 |
347 "movq %%mm0, %%mm6 # B7 B6 B5 B4 B3 B2 B1 B0\n\t" | 358 "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ |
348 "punpcklbw %%mm2, %%mm6 # G3 B3 G2 B2 G1 B1 G0 B0\n\t" | 359 "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ |
349 | 360 |
350 "punpckhwd %%mm7, %%mm6 # 00 R3 G3 B3 00 R2 G2 B2\n\t" | 361 "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 G2 B2 */ |
351 movntq " %%mm6, 8 (%3) # Store ARGB3 ARGB2\n\t" | 362 MOVNTQ " %%mm6, 8 (%3);" /* Store ARGB3 ARGB2 */ |
352 | 363 |
353 "punpckhbw %%mm2, %%mm4 # G7 B7 G6 B6 G5 B5 G4 B4\n\t" | 364 "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ |
354 "punpckhbw %%mm3, %%mm5 # 00 R7 00 R6 00 R5 00 R4\n\t" | 365 "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */ |
355 | 366 |
356 "punpcklwd %%mm5, %%mm4 # 00 R5 G5 B5 00 R4 G4 B4\n\t" | 367 "punpcklwd %%mm5, %%mm4;" /* 00 R5 G5 B5 00 R4 G4 B4 */ |
357 movntq " %%mm4, 16 (%3) # Store ARGB5 ARGB4\n\t" | 368 MOVNTQ " %%mm4, 16 (%3);" /* Store ARGB5 ARGB4 */ |
358 | 369 |
359 "movq %%mm0, %%mm4 # B7 B6 B5 B4 B3 B2 B1 B0\n\t" | 370 "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ |
360 "punpckhbw %%mm2, %%mm4 # G7 B7 G6 B6 G5 B5 G4 B4\n\t" | 371 "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ |
361 | 372 |
362 "punpckhwd %%mm5, %%mm4 # 00 R7 G7 B7 00 R6 G6 B6\n\t" | 373 "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 G6 B6 */ |
363 movntq " %%mm4, 24 (%3) # Store ARGB7 ARGB6\n\t" | 374 MOVNTQ " %%mm4, 24 (%3);" /* Store ARGB7 ARGB6 */ |
364 | 375 |
365 "movd 4 (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0\n\t" | 376 "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
366 "movd 4 (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0\n\t" | 377 "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
367 | 378 |
368 "pxor %%mm4, %%mm4 # zero mm4\n\t" | 379 "pxor %%mm4, %%mm4;" /* zero mm4 */ |
369 "movq 8 (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t" | 380 "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
370 | 381 |
371 : : "r" (py), "r" (pu), "r" (pv), "r" (image)); | 382 : : "r" (py), "r" (pu), "r" (pv), "r" (image)); |
372 | 383 |
373 py += 8; | 384 py += 8; |
374 pu += 4; | 385 pu += 4; |
387 | 398 |
388 py += (y_stride - h_size); | 399 py += (y_stride - h_size); |
389 image += (rgb_stride - 4*h_size); | 400 image += (rgb_stride - 4*h_size); |
390 | 401 |
391 /* load data for start of next scan line */ | 402 /* load data for start of next scan line */ |
392 __asm__ | 403 __asm__ __volatile__ |
393 ( | 404 ( |
394 ".align 8 \n\t" | 405 ".align 8;" |
395 "movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0\n\t" | 406 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
396 "movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0\n\t" | 407 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
397 | 408 |
398 //"movl $0, (%3) # cache preload for image\n\t" | 409 //"movl $0, (%3);" /* cache preload for image */ |
399 "movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0\n\t" | 410 "movq (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
400 : : "r" (py), "r" (pu), "r" (pv), "r" (image) | 411 : : "r" (py), "r" (pu), "r" (pv), "r" (image) |
401 ); | 412 ); |
402 | 413 |
403 | 414 |
404 x = 0; | 415 x = 0; |
405 y += 1; | 416 y += 1; |
406 even = (!even); | 417 even = (!even); |
407 } while ( y < v_size) ; | 418 } while ( y < v_size) ; |
408 | 419 |
409 __asm__ ("emms\n\t"); | 420 __asm__ __volatile__ (EMMS); |
410 } | 421 } |
411 | 422 |
412 yuv2rgb_fun yuv2rgb_init_mmx (int bpp, int mode) | 423 yuv2rgb_fun yuv2rgb_init_mmx (int bpp, int mode) |
413 { | 424 { |
414 // if (bpp == 15 || bpp == 16) { | 425 // if (bpp == 15 || bpp == 16) { |