comparison postproc/swscale.c @ 2316:bcb229557e9b

fixed alignment (static variables were sometimes not 8-byte aligned)
added half uv interpolation support
added prefetch
BGR15 support in MMX (untested) (so BGR15,16,24,32 are supported)
special unscaled height version (not much faster but it doesn't interpolate uv vertically)
author michael
date Sat, 20 Oct 2001 21:12:09 +0000
parents 7a89cb124e81
children 7d3542955132
2315:c3c73ba53f0e 2316:bcb229557e9b
1 1
2 2 // Software scaling and colorspace conversion routines for MPlayer
3 3
4 4 // Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
5 5 // current version mostly by Michael Niedermayer (michaelni@gmx.at)
6 // the parts written by michael are under GNU GPL
6 7
7 8 #include <inttypes.h>
8 9 #include "../config.h"
10 #include "swscale.h"
9 11
10 12 //#undef HAVE_MMX2
11 13 //#undef HAVE_MMX
12 14 //#undef ARCH_X86
13 #define DITHER16BPP 15 #define DITHER1XBPP
14 //#define ALT_ERROR 16 int fullUVIpol=0;
17 //disables the unscaled height version
18 int allwaysIpol=0;
15 19
16 20 #define RET 0xC3 //near return opcode
17 21 /*
18 22 NOTES
19 23
20 known BUGS with known cause (no bugreports please!) 24 known BUGS with known cause (no bugreports please!, but patches are welcome :) )
21 code reads 1 sample too much (might cause a sig11) 25 horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)
26
27 Supported output formats BGR15 BGR16 BGR24 BGR32 (15,24 are untested)
28 BGR15 & BGR16 MMX versions support dithering
29 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
22 30
23 31 TODO
24 check alignment of everything 32 more intelligent misalignment avoidance for the horizontal scaler
25 33 */
26 34
27 static uint64_t yCoeff= 0x2568256825682568LL; 35 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
28 static uint64_t ubCoeff= 0x3343334333433343LL; 36
29 static uint64_t vrCoeff= 0x40cf40cf40cf40cfLL; 37 #ifdef HAVE_MMX2
30 static uint64_t ugCoeff= 0xE5E2E5E2E5E2E5E2LL; 38 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
31 static uint64_t vgCoeff= 0xF36EF36EF36EF36ELL; 39 #elif defined (HAVE_3DNOW)
32 static uint64_t w80= 0x0080008000800080LL; 40 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
33 static uint64_t w10= 0x0010001000100010LL; 41 #endif
34 static uint64_t bm00000111=0x0000000000FFFFFFLL; 42
35 static uint64_t bm11111000=0xFFFFFFFFFF000000LL; 43 #ifdef HAVE_MMX2
36 44 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
37 static uint64_t b16Dither= 0x0004000400040004LL; 45 #else
38 static uint64_t b16Dither1=0x0004000400040004LL; 46 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
39 static uint64_t b16Dither2=0x0602060206020602LL; 47 #endif
40 static uint64_t g16Dither= 0x0002000200020002LL; 48
41 static uint64_t g16Dither1=0x0002000200020002LL; 49
42 static uint64_t g16Dither2=0x0301030103010301LL; 50 #ifdef HAVE_MMX
43 51 static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL;
44 static uint64_t b16Mask= 0x001F001F001F001FLL; 52 static uint64_t __attribute__((aligned(8))) ubCoeff= 0x3343334333433343LL;
45 static uint64_t g16Mask= 0x07E007E007E007E0LL; 53 static uint64_t __attribute__((aligned(8))) vrCoeff= 0x40cf40cf40cf40cfLL;
46 static uint64_t r16Mask= 0xF800F800F800F800LL; 54 static uint64_t __attribute__((aligned(8))) ugCoeff= 0xE5E2E5E2E5E2E5E2LL;
47 static uint64_t temp0; 55 static uint64_t __attribute__((aligned(8))) vgCoeff= 0xF36EF36EF36EF36ELL;
48 56 static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL;
57 static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL;
58 static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
59 static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
60 static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
61 static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
62
63 static uint64_t __attribute__((aligned(8))) b16Dither= 0x0004000400040004LL;
64 static uint64_t __attribute__((aligned(8))) b16Dither1=0x0004000400040004LL;
65 static uint64_t __attribute__((aligned(8))) b16Dither2=0x0602060206020602LL;
66 static uint64_t __attribute__((aligned(8))) g16Dither= 0x0002000200020002LL;
67 static uint64_t __attribute__((aligned(8))) g16Dither1=0x0002000200020002LL;
68 static uint64_t __attribute__((aligned(8))) g16Dither2=0x0301030103010301LL;
69
70 static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL;
71 static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL;
72 static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL;
73 static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL;
74 static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL;
75 static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL;
76
77 static uint64_t __attribute__((aligned(8))) temp0;
78 static uint64_t __attribute__((aligned(8))) asm_yalpha1;
79 static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
80 #endif
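All of these constants feed pmulhw, which multiplies two signed 16-bit lanes and keeps only the high 16 bits, i.e. a fixed-point multiply. A scalar sketch of what one lane of, e.g., "pmulhw yCoeff, %%mm1" computes (illustrative only; the MMX code does four lanes at once):

/* one lane of pmulhw: signed 16x16 multiply, keep the high half */
static inline int16_t pmulhw_lane(int16_t a, int16_t coeff)
{
	return (int16_t)(((int32_t)a * coeff) >> 16);
}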
49 81
50 82 // temporary storage for 4 yuv lines:
51 83 // 16bit for now (mmx likes it more compact)
84 #ifdef HAVE_MMX
85 static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
86 static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
87 #else
52 88 static uint16_t pix_buf_y[4][2048];
53 89 static uint16_t pix_buf_uv[2][2048*2];
90 #endif
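Only two of the four reserved luma lines are ever blended at a time; the vertical loop below picks the pair by the low bit of the source line number:

uint16_t *buf0 = pix_buf_y[y0 & 1];       /* top line of the interpolated slice */
uint16_t *buf1 = pix_buf_y[(y0 + 1) & 1]; /* bottom line of the interpolated slice */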
54 91
55 92 // clipping helper table for C implementations:
56 93 static unsigned char clip_table[768];
57 94
58 95 // yuv->rgb conversion tables:
63 100 static int yuvtab_40cf[256];
64 101
65 102
66 103 static uint8_t funnyYCode[10000];
67 104 static uint8_t funnyUVCode[10000];
105
106 #define FULL_YSCALEYUV2RGB \
107 "pxor %%mm7, %%mm7 \n\t"\
108 "movd %6, %%mm6 \n\t" /*yalpha1*/\
109 "punpcklwd %%mm6, %%mm6 \n\t"\
110 "punpcklwd %%mm6, %%mm6 \n\t"\
111 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
112 "punpcklwd %%mm5, %%mm5 \n\t"\
113 "punpcklwd %%mm5, %%mm5 \n\t"\
114 "xorl %%eax, %%eax \n\t"\
115 "1: \n\t"\
116 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
117 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
118 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
119 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
120 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
121 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
122 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
123 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
124 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
125 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
126 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
127 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
128 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
129 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
130 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
131 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
132 "psubw w400, %%mm3 \n\t" /* 8(U-128)*/\
133 "pmulhw yCoeff, %%mm1 \n\t"\
134 \
135 \
136 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
137 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
138 "pmulhw ubCoeff, %%mm3 \n\t"\
139 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
140 "pmulhw ugCoeff, %%mm2 \n\t"\
141 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
142 "psubw w400, %%mm0 \n\t" /* (V-128)8*/\
143 \
144 \
145 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
146 "pmulhw vrCoeff, %%mm0 \n\t"\
147 "pmulhw vgCoeff, %%mm4 \n\t"\
148 "paddw %%mm1, %%mm3 \n\t" /* B*/\
149 "paddw %%mm1, %%mm0 \n\t" /* R*/\
150 "packuswb %%mm3, %%mm3 \n\t"\
151 \
152 "packuswb %%mm0, %%mm0 \n\t"\
153 "paddw %%mm4, %%mm2 \n\t"\
154 "paddw %%mm2, %%mm1 \n\t" /* G*/\
155 \
156 "packuswb %%mm1, %%mm1 \n\t"
157
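FULL_YSCALEYUV2RGB is the MMX counterpart of the per-pixel work the C fallback in yuv2rgbX spells out: blend two horizontally prescaled lines with 12-bit weights, then convert through the yuv lookup tables. In scalar form (as in the C path below):

int Y = yuvtab_2568[(buf0[i]*yalpha1 + buf1[i]*yalpha) >> 19];
int U = (uvbuf0[i]*uvalpha1 + uvbuf1[i]*uvalpha) >> 19;
int V = (uvbuf0[i+2048]*uvalpha1 + uvbuf1[i+2048]*uvalpha) >> 19;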
158 #define YSCALEYUV2RGB \
159 "movd %6, %%mm6 \n\t" /*yalpha1*/\
160 "punpcklwd %%mm6, %%mm6 \n\t"\
161 "punpcklwd %%mm6, %%mm6 \n\t"\
162 "movq %%mm6, asm_yalpha1 \n\t"\
163 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
164 "punpcklwd %%mm5, %%mm5 \n\t"\
165 "punpcklwd %%mm5, %%mm5 \n\t"\
166 "movq %%mm5, asm_uvalpha1 \n\t"\
167 "xorl %%eax, %%eax \n\t"\
168 "1: \n\t"\
169 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
170 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
171 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
172 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
173 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
174 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
175 "movq asm_uvalpha1, %%mm0 \n\t"\
176 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
177 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
178 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
179 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
180 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
181 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
182 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
183 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
184 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
185 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
186 "pmulhw ugCoeff, %%mm3 \n\t"\
187 "pmulhw vgCoeff, %%mm4 \n\t"\
188 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
189 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
190 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
191 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
192 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
193 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
194 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
195 "pmulhw asm_yalpha1, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
196 "pmulhw asm_yalpha1, %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
197 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
198 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
199 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
200 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
201 "pmulhw ubCoeff, %%mm2 \n\t"\
202 "pmulhw vrCoeff, %%mm5 \n\t"\
203 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
204 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
205 "pmulhw yCoeff, %%mm1 \n\t"\
206 "pmulhw yCoeff, %%mm7 \n\t"\
207 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
208 "paddw %%mm3, %%mm4 \n\t"\
209 "movq %%mm2, %%mm0 \n\t"\
210 "movq %%mm5, %%mm6 \n\t"\
211 "movq %%mm4, %%mm3 \n\t"\
212 "punpcklwd %%mm2, %%mm2 \n\t"\
213 "punpcklwd %%mm5, %%mm5 \n\t"\
214 "punpcklwd %%mm4, %%mm4 \n\t"\
215 "paddw %%mm1, %%mm2 \n\t"\
216 "paddw %%mm1, %%mm5 \n\t"\
217 "paddw %%mm1, %%mm4 \n\t"\
218 "punpckhwd %%mm0, %%mm0 \n\t"\
219 "punpckhwd %%mm6, %%mm6 \n\t"\
220 "punpckhwd %%mm3, %%mm3 \n\t"\
221 "paddw %%mm7, %%mm0 \n\t"\
222 "paddw %%mm7, %%mm6 \n\t"\
223 "paddw %%mm7, %%mm3 \n\t"\
224 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
225 "packuswb %%mm0, %%mm2 \n\t"\
226 "packuswb %%mm6, %%mm5 \n\t"\
227 "packuswb %%mm3, %%mm4 \n\t"\
228 "pxor %%mm7, %%mm7 \n\t"
229
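Unlike the FULL variant above, YSCALEYUV2RGB processes eight luma samples per iteration and shares each chroma sample between two neighbouring pixels; the punpcklwd/punpckhwd pairs duplicate U and V across lanes. The C fallback expresses the same 2:1 chroma reuse as:

int U = (uvbuf0[i/2]*uvalpha1 + uvbuf1[i/2]*uvalpha) >> 19; /* same U for pixels 2k and 2k+1 */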
230 #define YSCALEYUV2RGB1 \
231 "xorl %%eax, %%eax \n\t"\
232 "1: \n\t"\
233 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
234 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
235 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
236 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
237 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
238 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
239 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
240 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
241 "pmulhw ugCoeff, %%mm3 \n\t"\
242 "pmulhw vgCoeff, %%mm4 \n\t"\
243 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
244 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
245 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
246 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
247 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
248 "pmulhw ubCoeff, %%mm2 \n\t"\
249 "pmulhw vrCoeff, %%mm5 \n\t"\
250 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
251 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
252 "pmulhw yCoeff, %%mm1 \n\t"\
253 "pmulhw yCoeff, %%mm7 \n\t"\
254 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
255 "paddw %%mm3, %%mm4 \n\t"\
256 "movq %%mm2, %%mm0 \n\t"\
257 "movq %%mm5, %%mm6 \n\t"\
258 "movq %%mm4, %%mm3 \n\t"\
259 "punpcklwd %%mm2, %%mm2 \n\t"\
260 "punpcklwd %%mm5, %%mm5 \n\t"\
261 "punpcklwd %%mm4, %%mm4 \n\t"\
262 "paddw %%mm1, %%mm2 \n\t"\
263 "paddw %%mm1, %%mm5 \n\t"\
264 "paddw %%mm1, %%mm4 \n\t"\
265 "punpckhwd %%mm0, %%mm0 \n\t"\
266 "punpckhwd %%mm6, %%mm6 \n\t"\
267 "punpckhwd %%mm3, %%mm3 \n\t"\
268 "paddw %%mm7, %%mm0 \n\t"\
269 "paddw %%mm7, %%mm6 \n\t"\
270 "paddw %%mm7, %%mm3 \n\t"\
271 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
272 "packuswb %%mm0, %%mm2 \n\t"\
273 "packuswb %%mm6, %%mm5 \n\t"\
274 "packuswb %%mm3, %%mm4 \n\t"\
275 "pxor %%mm7, %%mm7 \n\t"
276
277 #define WRITEBGR32 \
278 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
279 "movq %%mm2, %%mm1 \n\t" /* B */\
280 "movq %%mm5, %%mm6 \n\t" /* R */\
281 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
282 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
283 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
284 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
285 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
286 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
287 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
288 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
289 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
290 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
291 \
292 MOVNTQ(%%mm0, (%4, %%eax, 4))\
293 MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
294 MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
295 MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
296 \
297 "addl $8, %%eax \n\t"\
298 "cmpl %5, %%eax \n\t"\
299 " jb 1b \n\t"
300
301 #define WRITEBGR16 \
302 "movq %%mm2, %%mm1 \n\t" /* B */\
303 "movq %%mm4, %%mm3 \n\t" /* G */\
304 "movq %%mm5, %%mm6 \n\t" /* R */\
305 \
306 "punpcklbw %%mm7, %%mm3 \n\t" /* 0G0G0G0G */\
307 "punpcklbw %%mm7, %%mm2 \n\t" /* 0B0B0B0B */\
308 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R */\
309 \
310 "psrlw $3, %%mm2 \n\t"\
311 "psllw $3, %%mm3 \n\t"\
312 "psllw $8, %%mm5 \n\t"\
313 \
314 "pand g16Mask, %%mm3 \n\t"\
315 "pand r16Mask, %%mm5 \n\t"\
316 \
317 "por %%mm3, %%mm2 \n\t"\
318 "por %%mm5, %%mm2 \n\t"\
319 \
320 "punpckhbw %%mm7, %%mm4 \n\t" /* 0G0G0G0G */\
321 "punpckhbw %%mm7, %%mm1 \n\t" /* 0B0B0B0B */\
322 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R */\
323 \
324 "psrlw $3, %%mm1 \n\t"\
325 "psllw $3, %%mm4 \n\t"\
326 "psllw $8, %%mm6 \n\t"\
327 \
328 "pand g16Mask, %%mm4 \n\t"\
329 "pand r16Mask, %%mm6 \n\t"\
330 \
331 "por %%mm4, %%mm1 \n\t"\
332 "por %%mm6, %%mm1 \n\t"\
333 \
334 MOVNTQ(%%mm2, (%4, %%eax, 2))\
335 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
336 \
337 "addl $8, %%eax \n\t"\
338 "cmpl %5, %%eax \n\t"\
339 " jb 1b \n\t"
340
341 #define WRITEBGR15 \
342 "movq %%mm2, %%mm1 \n\t" /* B */\
343 "movq %%mm4, %%mm3 \n\t" /* G */\
344 "movq %%mm5, %%mm6 \n\t" /* R */\
345 \
346 "punpcklbw %%mm7, %%mm3 \n\t" /* 0G0G0G0G */\
347 "punpcklbw %%mm7, %%mm2 \n\t" /* 0B0B0B0B */\
348 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R */\
349 \
350 "psrlw $3, %%mm2 \n\t"\
351 "psllw $2, %%mm3 \n\t"\
352 "psllw $7, %%mm5 \n\t"\
353 \
354 "pand g15Mask, %%mm3 \n\t"\
355 "pand r15Mask, %%mm5 \n\t"\
356 \
357 "por %%mm3, %%mm2 \n\t"\
358 "por %%mm5, %%mm2 \n\t"\
359 \
360 "punpckhbw %%mm7, %%mm4 \n\t" /* 0G0G0G0G */\
361 "punpckhbw %%mm7, %%mm1 \n\t" /* 0B0B0B0B */\
362 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R */\
363 \
364 "psrlw $3, %%mm1 \n\t"\
365 "psllw $2, %%mm4 \n\t"\
366 "psllw $7, %%mm6 \n\t"\
367 \
368 "pand g15Mask, %%mm4 \n\t"\
369 "pand r15Mask, %%mm6 \n\t"\
370 \
371 "por %%mm4, %%mm1 \n\t"\
372 "por %%mm6, %%mm1 \n\t"\
373 \
374 MOVNTQ(%%mm2, (%4, %%eax, 2))\
375 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
376 \
377 "addl $8, %%eax \n\t"\
378 "cmpl %5, %%eax \n\t"\
379 " jb 1b \n\t"
380 // FIXME find a faster way to shuffle it to BGR24
381 #define WRITEBGR24 \
382 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
383 "movq %%mm2, %%mm1 \n\t" /* B */\
384 "movq %%mm5, %%mm6 \n\t" /* R */\
385 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
386 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
387 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
388 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
389 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
390 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
391 "punpcklbw %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
392 "punpckhbw %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
393 "punpcklbw %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
394 "punpckhbw %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
395 \
396 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
397 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
398 "pand bm00000111, %%mm4 \n\t" /* 00000RGB 0 */\
399 "pand bm11111000, %%mm0 \n\t" /* 00RGB000 0.5 */\
400 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
401 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
402 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
403 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
404 \
405 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
406 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
407 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
408 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
409 "pand bm00001111, %%mm2 \n\t" /* 0000RGBR 1 */\
410 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
411 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
412 "pand bm00000111, %%mm4 \n\t" /* 00000RGB 2 */\
413 "pand bm11111000, %%mm1 \n\t" /* 00RGB000 2.5 */\
414 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
415 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
416 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
417 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
418 \
419 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
420 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
421 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
422 "pand bm00000111, %%mm5 \n\t" /* 00000RGB 3 */\
423 "pand bm11111000, %%mm3 \n\t" /* 00RGB000 3.5 */\
424 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
425 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
426 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
427 \
428 "leal (%%eax, %%eax, 2), %%ebx \n\t"\
429 MOVNTQ(%%mm0, (%4, %%ebx))\
430 MOVNTQ(%%mm2, 8(%4, %%ebx))\
431 MOVNTQ(%%mm3, 16(%4, %%ebx))\
432 \
433 "addl $8, %%eax \n\t"\
434 "cmpl %5, %%eax \n\t"\
435 " jb 1b \n\t"
436
437
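The only non-obvious part of WRITEBGR24 is addressing: at three bytes per pixel the destination offset is 3*x, which "leal (%%eax, %%eax, 2), %%ebx" computes as eax + 2*eax. The C path gets the same effect with pointer stepping:

dest[0] = b; dest[1] = g; dest[2] = r;
dest += dstbpp>>3; /* 24>>3 == 3 bytes per pixel */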
438 /**
439 * vertical scale YV12 to RGB
440 */
441 static inline void yuv2rgbX(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
442 uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
443 {
444 int yalpha1=yalpha^4095;
445 int uvalpha1=uvalpha^4095;
446 int i;
447
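yalpha and uvalpha are 12-bit blend weights, so the XOR is just the complement: for 0 <= a <= 4095, (a ^ 4095) == 4095 - a, giving a weight pair that sums to 4095 for the vertical blend:

int yalpha1 = yalpha ^ 4095; /* == 4095 - yalpha; yalpha1 + yalpha == 4095 */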
448 if(fullUVIpol)
449 {
450
451 #ifdef HAVE_MMX
452 if(dstbpp == 32)
453 {
454 asm volatile(
455
456
457 FULL_YSCALEYUV2RGB
458 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
459 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
460
461 "movq %%mm3, %%mm1 \n\t"
462 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
463 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
464
465 MOVNTQ(%%mm3, (%4, %%eax, 4))
466 MOVNTQ(%%mm1, 8(%4, %%eax, 4))
467
468 "addl $4, %%eax \n\t"
469 "cmpl %5, %%eax \n\t"
470 " jb 1b \n\t"
471
472
473 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
474 "m" (yalpha1), "m" (uvalpha1)
475 : "%eax"
476 );
477 }
478 else if(dstbpp==24)
479 {
480 asm volatile(
481
482 FULL_YSCALEYUV2RGB
483
484 // lsb ... msb
485 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
486 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
487
488 "movq %%mm3, %%mm1 \n\t"
489 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
490 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
491
492 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
493 "psrlq $8, %%mm3 \n\t" // GR0BGR00
494 "pand bm00000111, %%mm2 \n\t" // BGR00000
495 "pand bm11111000, %%mm3 \n\t" // 000BGR00
496 "por %%mm2, %%mm3 \n\t" // BGRBGR00
497 "movq %%mm1, %%mm2 \n\t"
498 "psllq $48, %%mm1 \n\t" // 000000BG
499 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
500
501 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
502 "psrld $16, %%mm2 \n\t" // R000R000
503 "psrlq $24, %%mm1 \n\t" // 0BGR0000
504 "por %%mm2, %%mm1 \n\t" // RBGRR000
505
506 "movl %4, %%ebx \n\t"
507 "addl %%eax, %%ebx \n\t"
508
509 #ifdef HAVE_MMX2
510 //FIXME Alignment
511 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
512 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
513 #else
514 "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
515 "psrlq $32, %%mm3 \n\t"
516 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
517 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
518 #endif
519 "addl $4, %%eax \n\t"
520 "cmpl %5, %%eax \n\t"
521 " jb 1b \n\t"
522
523 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
524 "m" (yalpha1), "m" (uvalpha1)
525 : "%eax", "%ebx"
526 );
527 }
528 else if(dstbpp==15)
529 {
530 asm volatile(
531
532 FULL_YSCALEYUV2RGB
533 #ifdef DITHER1XBPP
534 "paddusb b16Dither, %%mm1 \n\t"
535 "paddusb b16Dither, %%mm0 \n\t"
536 "paddusb b16Dither, %%mm3 \n\t"
537 #endif
538 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
539 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
540 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
541
542 "psrlw $3, %%mm3 \n\t"
543 "psllw $2, %%mm1 \n\t"
544 "psllw $7, %%mm0 \n\t"
545 "pand g15Mask, %%mm1 \n\t"
546 "pand r15Mask, %%mm0 \n\t"
547
548 "por %%mm3, %%mm1 \n\t"
549 "por %%mm1, %%mm0 \n\t"
550
551 MOVNTQ(%%mm0, (%4, %%eax, 2))
552
553 "addl $4, %%eax \n\t"
554 "cmpl %5, %%eax \n\t"
555 " jb 1b \n\t"
556
557 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
558 "m" (yalpha1), "m" (uvalpha1)
559 : "%eax"
560 );
561 }
562 else if(dstbpp==16)
563 {
564 asm volatile(
565
566 FULL_YSCALEYUV2RGB
567 #ifdef DITHER1XBPP
568 "paddusb g16Dither, %%mm1 \n\t"
569 "paddusb b16Dither, %%mm0 \n\t"
570 "paddusb b16Dither, %%mm3 \n\t"
571 #endif
572 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
573 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
574 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
575
576 "psrlw $3, %%mm3 \n\t"
577 "psllw $3, %%mm1 \n\t"
578 "psllw $8, %%mm0 \n\t"
579 "pand g16Mask, %%mm1 \n\t"
580 "pand r16Mask, %%mm0 \n\t"
581
582 "por %%mm3, %%mm1 \n\t"
583 "por %%mm1, %%mm0 \n\t"
584
585 MOVNTQ(%%mm0, (%4, %%eax, 2))
586
587 "addl $4, %%eax \n\t"
588 "cmpl %5, %%eax \n\t"
589 " jb 1b \n\t"
590
591 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
592 "m" (yalpha1), "m" (uvalpha1)
593 : "%eax"
594 );
595 }
596 #else
597 if(dstbpp==32 || dstbpp==24)
598 {
599 for(i=0;i<dstw;i++){
600 // vertical linear interpolation && yuv2rgb in a single step:
601 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
602 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
603 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
604 dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
605 dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
606 dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
607 dest+=dstbpp>>3;
608 }
609 }
610 else if(dstbpp==16)
611 {
612 for(i=0;i<dstw;i++){
613 // vertical linear interpolation && yuv2rgb in a single step:
614 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
615 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
616 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
617
618 ((uint16_t*)dest)[0] =
619 (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
620 (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
621 (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
622 dest+=2;
623 }
624 }
625 else if(dstbpp==15)
626 {
627 for(i=0;i<dstw;i++){
628 // vertical linear interpolation && yuv2rgb in a single step:
629 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
630 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
631 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
632
633 ((uint16_t*)dest)[0] =
634 (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
635 (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
636 (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
637 dest+=2;
638 }
639 }
640 #endif
641 }//FULL_UV_IPOL
642 else
643 {
644 #ifdef HAVE_MMX
645 if(dstbpp == 32)
646 {
647 asm volatile(
648 YSCALEYUV2RGB
649 WRITEBGR32
650
651 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
652 "m" (yalpha1), "m" (uvalpha1)
653 : "%eax"
654 );
655 }
656 else if(dstbpp==24)
657 {
658 asm volatile(
659 YSCALEYUV2RGB
660 WRITEBGR24
661
662 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
663 "m" (yalpha1), "m" (uvalpha1)
664 : "%eax", "%ebx"
665 );
666 }
667 else if(dstbpp==15)
668 {
669 asm volatile(
670 YSCALEYUV2RGB
671 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
672 #ifdef DITHER1XBPP
673 "paddusb b16Dither, %%mm2 \n\t"
674 "paddusb b16Dither, %%mm4 \n\t"
675 "paddusb b16Dither, %%mm5 \n\t"
676 #endif
677
678 WRITEBGR15
679
680 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
681 "m" (yalpha1), "m" (uvalpha1)
682 : "%eax"
683 );
684 }
685 else if(dstbpp==16)
686 {
687 asm volatile(
688 YSCALEYUV2RGB
689 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
690 #ifdef DITHER1XBPP
691 "paddusb g16Dither, %%mm2 \n\t"
692 "paddusb b16Dither, %%mm4 \n\t"
693 "paddusb b16Dither, %%mm5 \n\t"
694 #endif
695
696 WRITEBGR16
697
698 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
699 "m" (yalpha1), "m" (uvalpha1)
700 : "%eax"
701 );
702 }
703 #else
704 //FIXME unroll C loop and don't recalculate UV
705 if(dstbpp==32 || dstbpp==24)
706 {
707 for(i=0;i<dstw;i++){
708 // vertical linear interpolation && yuv2rgb in a single step:
709 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
710 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
711 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
712 dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
713 dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
714 dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
715 dest+=dstbpp>>3;
716 }
717 }
718 else if(dstbpp==16)
719 {
720 for(i=0;i<dstw;i++){
721 // vertical linear interpolation && yuv2rgb in a single step:
722 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
723 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
724 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
725
726 ((uint16_t*)dest)[0] =
727 (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
728 (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
729 (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
730 dest+=2;
731 }
732 }
733 else if(dstbpp==15)
734 {
735 for(i=0;i<dstw;i++){
736 // vertical linear interpolation && yuv2rgb in a single step:
737 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
738 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
739 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
740
741 ((uint16_t*)dest)[0] =
742 (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
743 (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
744 (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
745 dest+=2;
746 }
747 }
748 #endif
749 } //!FULL_UV_IPOL
750 }
751
752 /**
753 * YV12 to RGB without scaling or interpolating
754 */
755 static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
756 uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
757 {
758 int yalpha1=yalpha^4095;
759 int uvalpha1=uvalpha^4095;
760 int i;
761 if(fullUVIpol || allwaysIpol)
762 {
763 yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
764 return;
765 }
766 #ifdef HAVE_MMX
767 if(dstbpp == 32)
768 {
769 asm volatile(
770 YSCALEYUV2RGB1
771 WRITEBGR32
772 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
773 "m" (yalpha1), "m" (uvalpha1)
774 : "%eax"
775 );
776 }
777 else if(dstbpp==24)
778 {
779 asm volatile(
780 YSCALEYUV2RGB1
781 WRITEBGR24
782 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
783 "m" (yalpha1), "m" (uvalpha1)
784 : "%eax", "%ebx"
785 );
786 }
787 else if(dstbpp==15)
788 {
789 asm volatile(
790 YSCALEYUV2RGB1
791 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
792 #ifdef DITHER1XBPP
793 "paddusb b16Dither, %%mm2 \n\t"
794 "paddusb b16Dither, %%mm4 \n\t"
795 "paddusb b16Dither, %%mm5 \n\t"
796 #endif
797 WRITEBGR15
798 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
799 "m" (yalpha1), "m" (uvalpha1)
800 : "%eax"
801 );
802 }
803 else if(dstbpp==16)
804 {
805 asm volatile(
806 YSCALEYUV2RGB1
807 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
808 #ifdef DITHER1XBPP
809 "paddusb g16Dither, %%mm2 \n\t"
810 "paddusb b16Dither, %%mm4 \n\t"
811 "paddusb b16Dither, %%mm5 \n\t"
812 #endif
813
814 WRITEBGR16
815 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
816 "m" (yalpha1), "m" (uvalpha1)
817 : "%eax"
818 );
819 }
820 #else
821 //FIXME unroll C loop and don't recalculate UV
822 if(dstbpp==32 || dstbpp==24)
823 {
824 for(i=0;i<dstw;i++){
825 // vertical linear interpolation && yuv2rgb in a single step:
826 int Y=yuvtab_2568[buf0[i]>>7];
827 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
828 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
829 dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
830 dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
831 dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
832 dest+=dstbpp>>3;
833 }
834 }
835 else if(dstbpp==16)
836 {
837 for(i=0;i<dstw;i++){
838 // vertical linear interpolation && yuv2rgb in a single step:
839 int Y=yuvtab_2568[buf0[i]>>7];
840 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
841 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
842
843 ((uint16_t*)dest)[0] =
844 (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
845 (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
846 (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
847 dest+=2;
848 }
849 }
850 else if(dstbpp==15)
851 {
852 for(i=0;i<dstw;i++){
853 // vertical linear interpolation && yuv2rgb in a single step:
854 int Y=yuvtab_2568[buf0[i]>>7];
855 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
856 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
857
858 ((uint16_t*)dest)[0] =
859 (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
860 (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
861 (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
862 dest+=2;
863 }
864 }
865 #endif
866 }
867
868
68 869
69 870
70 871 // *** bilinear scaling and yuv->rgb conversion of yv12 slices:
71 872 // *** Note: it's called multiple times while decoding a frame, first time y==0
72 873 // *** Designed to upscale, but may work for downscale too.
93 894
94 895 #ifdef HAVE_MMX2
95 896 // used to detect a horizontal size change
96 897 static int old_dstw= -1;
97 898 static int old_s_xinc= -1;
98 899 #endif
99 #endif 900
100 901 int canMMX2BeUsed=0;
101 902 int srcWidth= (dstw*s_xinc + 0x8000)>>16;
903 int dstUVw= fullUVIpol ? dstw : dstw/2;
904
102 905
103 906 #ifdef HAVE_MMX2
104 907 canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0 && (srcWidth&15)==0) ? 1 : 0;
105 908 #endif
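s_xinc is 16.16 fixed point, so s_xinc <= 0x10000 restricts the MMX2 code generator to horizontal upscaling (or 1:1); the width conditions (dstw a multiple of 32, srcWidth a multiple of 16) presumably match the block sizes the generated fragments process. Spelled out:

int canMMX2BeUsed = (s_xinc <= 0x10000)    /* step <= 1.0: no horizontal shrink */
                 && ((dstw & 31) == 0)     /* dstw multiple of 32 */
                 && ((srcWidth & 15) == 0);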
106 909
109 912 // FIXME this is not perfect, but no one should notice the difference, the more correct variant
110 913 // would be like the vertical one, but that would require some special code for the
111 914 // first and last pixel
112 915 if(canMMX2BeUsed) s_xinc+= 20;
113 else s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20; 916 else s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20;
114 s_xinc2=s_xinc>>1; 917
115 918 if(fullUVIpol) s_xinc2= s_xinc>>1;
919 else s_xinc2= s_xinc;
116 920 // force calculation of the horizontal interpolation of the first line
117 921 s_last_ypos=-99;
118 922 s_last_y1pos=-99;
119 923
120 924 if(y==0){
213 1017
214 1018 funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]=
215 1019 funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
216 1020 a | (b<<2) | (c<<4) | (d<<6);
217 1021
1022 // if we don't need to read 8 bytes then don't :), reduces the chance of
1023 // crossing a cache line
1024 if(d<3) funnyYCode[fragmentLength*i/4 + 1]= 0x6E;
1025
218 1026 funnyYCode[fragmentLength*(i+4)/4]= RET;
219 1027 }
220 1028 xpos+=s_xinc;
221 1029 }
222 1030
223 1031 xpos= 0; //s_xinc2/2 - 0x10000; // difference between centers of chroma samples
224 for(i=0; i<dstw/8; i++) 1032 for(i=0; i<dstUVw/8; i++)
225 1033 {
226 1034 int xx=xpos>>16;
227 1035
228 1036 if((i&3) == 0)
229 1037 {
236 1044
237 1045 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
238 1046 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
239 1047 a | (b<<2) | (c<<4) | (d<<6);
240 1048
1049 // if we don't need to read 8 bytes then don't :), reduces the chance of
1050 // crossing a cache line
1051 if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;
1052
241 1053 funnyUVCode[fragmentLength*(i+4)/4]= RET;
242 1054 }
243 1055 xpos+=s_xinc2;
244 1056 }
245 1057 // funnyCode[0]= RET;
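The d<3 patches above rely on the x86 encoding: the generated fragment starts with a movq load (opcode bytes 0F 6F), and overwriting the second byte with 0x6E turns it into movd, a 4-byte load, so fragments whose shuffle only uses the low half never read the extra 4 bytes:

/* movq mm,m64 encodes as 0F 6F; movd mm,m32 as 0F 6E -- patch the second byte */
if(d<3) funnyYCode[fragmentLength*i/4 + 1]= 0x6E;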
253 1065 unsigned char *dest=dstptr+dststride*s_ypos;
254 1066 int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line
255 1067 // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src)
256 1068 int srcuvpos= s_srcypos + s_yinc/2 - 0x8000;
257 1069 int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line
258 int yalpha=((s_srcypos-1)&0xFFFF)>>7; 1070 int yalpha=((s_srcypos-1)&0xFFFF)>>4;
259 int yalpha1=yalpha^511; 1071 int uvalpha=((srcuvpos-1)&0x1FFFF)>>5;
260 int uvalpha=((srcuvpos-1)&0x1FFFF)>>8;
261 int uvalpha1=uvalpha^511;
262 1072 uint16_t *buf0=pix_buf_y[y0&1]; // top line of the interpolated slice
263 1073 uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice
264 1074 uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top line of the interpolated slice
265 1075 uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1]; // bottom line of the interpolated slice
266 1076 int i;
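Note the precision bump in this revision: yalpha used to be ((s_srcypos-1)&0xFFFF)>>7 with yalpha1 = yalpha^511 (9 bits); it is now kept at 12 bits so it matches the ^4095 complements inside yuv2rgb1/yuv2rgbX:

int yalpha= ((s_srcypos-1)&0xFFFF)>>4;  /* 0..4095 */
int uvalpha= ((srcuvpos-1)&0x1FFFF)>>5; /* 0..4095 */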
318 "movl %1, %%edi \n\t" // buf1 1128 "movl %1, %%edi \n\t" // buf1
319 "movl %3, %%edx \n\t" // (s_xinc*4)>>16 1129 "movl %3, %%edx \n\t" // (s_xinc*4)>>16
320 "xorl %%ecx, %%ecx \n\t" 1130 "xorl %%ecx, %%ecx \n\t"
321 "xorl %%ebx, %%ebx \n\t" 1131 "xorl %%ebx, %%ebx \n\t"
322 "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF 1132 "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF
323 // "int $3\n\t" 1133 #ifdef HAVE_MMX2
324 "call funnyYCode \n\t" 1134 #define FUNNY_Y_CODE \
325 "movq temp0, %%mm2 \n\t" 1135 "prefetchnta 1024(%%esi) \n\t"\
1136 "prefetchnta 1056(%%esi) \n\t"\
1137 "prefetchnta 1088(%%esi) \n\t"\
1138 "call funnyYCode \n\t"\
1139 "movq temp0, %%mm2 \n\t"\
326 "xorl %%ecx, %%ecx \n\t" 1140 "xorl %%ecx, %%ecx \n\t"
327 "call funnyYCode \n\t" 1141 #else
328 "movq temp0, %%mm2 \n\t" 1142 #define FUNNY_Y_CODE \
1143 "call funnyYCode \n\t"\
1144 "movq temp0, %%mm2 \n\t"\
329 "xorl %%ecx, %%ecx \n\t" 1145 "xorl %%ecx, %%ecx \n\t"
330 "call funnyYCode \n\t" 1146 #endif
331 "movq temp0, %%mm2 \n\t" 1147 FUNNY_Y_CODE
332 "xorl %%ecx, %%ecx \n\t" 1148 FUNNY_Y_CODE
333 "call funnyYCode \n\t" 1149 FUNNY_Y_CODE
334 "movq temp0, %%mm2 \n\t" 1150 FUNNY_Y_CODE
335 "xorl %%ecx, %%ecx \n\t" 1151 FUNNY_Y_CODE
336 "call funnyYCode \n\t" 1152 FUNNY_Y_CODE
337 "movq temp0, %%mm2 \n\t" 1153 FUNNY_Y_CODE
338 "xorl %%ecx, %%ecx \n\t" 1154 FUNNY_Y_CODE
339 "call funnyYCode \n\t" 1155
340 "movq temp0, %%mm2 \n\t"
341 "xorl %%ecx, %%ecx \n\t"
342 "call funnyYCode \n\t"
343 "movq temp0, %%mm2 \n\t"
344 "xorl %%ecx, %%ecx \n\t"
345 "call funnyYCode \n\t"
346 :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16), 1156 :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16),
347 "m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF) 1157 "m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF)
348 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" 1158 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
349 ); 1159 );
350 for(i=dstw-1; (i*s_xinc)>>16 >=srcWidth-1; i--) buf1[i] = src[srcWidth-1]*128; 1160 for(i=dstw-1; (i*s_xinc)>>16 >=srcWidth-1; i--) buf1[i] = src[srcWidth-1]*128;
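The prefetchnta lines in FUNNY_Y_CODE pull source data roughly 1KB ahead of the generated scaler through the non-temporal hint, keeping it from evicting other cached data. A compiler-level sketch of the same hint (assumption: GCC's __builtin_prefetch; the real code emits the raw instruction):

__builtin_prefetch(src + 1024, 0 /*read*/, 0 /*non-temporal*/); /* ~= prefetchnta 1024(%%esi) */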
351 1161 }
352 1162 else
353 1163 {
354 1164 #endif
355 //NO MMX just normal asm ... FIXME try/write funny MMX2 variant 1165 //NO MMX just normal asm ...
356 //FIXME add prefetch
357 1166 asm volatile(
358 1167 "xorl %%eax, %%eax \n\t" // i
359 1168 "xorl %%ebx, %%ebx \n\t" // xx
360 1169 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
361 1170 "1: \n\t"
436 "movl %3, %%edx \n\t" // (s_xinc*4)>>16 1245 "movl %3, %%edx \n\t" // (s_xinc*4)>>16
437 "xorl %%ecx, %%ecx \n\t" 1246 "xorl %%ecx, %%ecx \n\t"
438 "xorl %%ebx, %%ebx \n\t" 1247 "xorl %%ebx, %%ebx \n\t"
439 "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF 1248 "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF
440 1249
441 // "int $3\n\t" 1250 #ifdef HAVE_MMX2
442 #define FUNNYUVCODE \ 1251 #define FUNNYUVCODE \
443 "call funnyUVCode \n\t"\ 1252 "prefetchnta 1024(%%esi) \n\t"\
444 "movq temp0, %%mm2 \n\t"\ 1253 "prefetchnta 1056(%%esi) \n\t"\
445 "xorl %%ecx, %%ecx \n\t" 1254 "prefetchnta 1088(%%esi) \n\t"\
446 1255 "call funnyUVCode \n\t"\
447 FUNNYUVCODE 1256 "movq temp0, %%mm2 \n\t"\
448 FUNNYUVCODE 1257 "xorl %%ecx, %%ecx \n\t"
449 FUNNYUVCODE 1258 #else
450 FUNNYUVCODE 1259 #define FUNNYUVCODE \
451 1260 "call funnyUVCode \n\t"\
452 FUNNYUVCODE 1261 "movq temp0, %%mm2 \n\t"\
453 FUNNYUVCODE 1262 "xorl %%ecx, %%ecx \n\t"
454 FUNNYUVCODE 1263 #endif
455 FUNNYUVCODE 1264
456 1265 FUNNYUVCODE
1266 FUNNYUVCODE
1267 FUNNYUVCODE
1268 FUNNYUVCODE
1269
1270 FUNNYUVCODE
1271 FUNNYUVCODE
1272 FUNNYUVCODE
1273 FUNNYUVCODE
457 1274
458 1275
459 "xorl %%eax, %%eax \n\t" // i 1276 "xorl %%eax, %%eax \n\t" // i
460 "movl %6, %%esi \n\t" // src 1277 "movl %6, %%esi \n\t" // src
461 "movl %1, %%edi \n\t" // buf1 1278 "movl %1, %%edi \n\t" // buf1
469 1286 FUNNYUVCODE
470 1287 FUNNYUVCODE
471 1288 FUNNYUVCODE
472 1289 FUNNYUVCODE
473 1290
474 :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>16), 1291 :: "m" (src1), "m" (uvbuf1), "m" (dstUVw), "m" ((s_xinc2*4)>>16),
475 "m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2) 1292 "m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2)
476 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" 1293 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
477 ); 1294 );
478 for(i=dstw-1; (i*s_xinc2)>>16 >=srcWidth/2-1; i--) 1295 for(i=dstUVw-1; (i*s_xinc2)>>16 >=srcWidth/2-1; i--)
479 1296 {
480 1297 uvbuf1[i] = src1[srcWidth/2-1]*128;
481 1298 uvbuf1[i+2048] = src2[srcWidth/2-1]*128;
482 1299 }
483 1300 }
514 "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry 1331 "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry
515 "addl $1, %%eax \n\t" 1332 "addl $1, %%eax \n\t"
516 "cmpl %2, %%eax \n\t" 1333 "cmpl %2, %%eax \n\t"
517 " jb 1b \n\t" 1334 " jb 1b \n\t"
518 1335
519 1336 :: "m" (src1), "m" (uvbuf1), "m" (dstUVw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF),
520 :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF),
521 "r" (src2) 1337 "r" (src2)
522 : "%eax", "%ebx", "%ecx", "%edi", "%esi" 1338 : "%eax", "%ebx", "%ecx", "%edi", "%esi"
523 ); 1339 );
524 #ifdef HAVE_MMX2 1340 #ifdef HAVE_MMX2
525 } //if MMX2 cant be used 1341 } //if MMX2 cant be used
526 #endif 1342 #endif
527 #else 1343 #else
528 for(i=0;i<dstw;i++){ 1344 for(i=0;i<dstUVw;i++){
529 1345 register unsigned int xx=xpos>>16;
530 1346 register unsigned int xalpha=(xpos&0xFFFF)>>9;
531 1347 uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
532 1348 uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
533 1349 xpos+=s_xinc2;
539 1355 s_srcypos= s_yinc/2 - 0x8000;
540 1356 continue;
541 1357 }
542 1358 }
543 1359
544 1360 if(ABS(s_yinc - 0x10000) < 10)
545 // Note1: this code can be restricted to n*8 (or n*16) width lines to simplify optimization... 1361 yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
546 // Re: Note1: ok n*4 for now 1362 else
547 // Note2: instead of using lookup tabs, mmx version could do the multiply... 1363 yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
548 // Re: Note2: yep 1364
549 // Note3: maybe we should make separated 15/16, 24 and 32bpp version of this:
550 // Re: done (32 & 16) and 16 has dithering :) but 16 is untested
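s_yinc is 16.16 fixed point, so 0x10000 is exactly 1.0; the new dispatch treats anything within 10/65536 of that as unscaled height and calls yuv2rgb1, which skips the vertical blend (unless fullUVIpol or allwaysIpol forces the interpolating path):

/* near-1:1 vertical scale -> no vertical interpolation needed */
if(ABS(s_yinc - 0x10000) < 10)
	yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
else
	yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);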
551 1365 #ifdef HAVE_MMX
552 //FIXME write lq version with less uv ... 1366 b16Dither= b16Dither1;
553 //FIXME reorder / optimize
554 if(dstbpp == 32)
555 {
556 asm volatile(
557
558 #define YSCALEYUV2RGB \
559 "pxor %%mm7, %%mm7 \n\t"\
560 "movd %6, %%mm6 \n\t" /*yalpha1*/\
561 "punpcklwd %%mm6, %%mm6 \n\t"\
562 "punpcklwd %%mm6, %%mm6 \n\t"\
563 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
564 "punpcklwd %%mm5, %%mm5 \n\t"\
565 "punpcklwd %%mm5, %%mm5 \n\t"\
566 "xorl %%eax, %%eax \n\t"\
567 "1: \n\t"\
568 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
569 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
570 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
571 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
572 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
573 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
574 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
575 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
576 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
577 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
578 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
579 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
580 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
581 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
582 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
583 "psubw w10, %%mm1 \n\t" /* Y-16*/\
584 "psubw w80, %%mm3 \n\t" /* (U-128)*/\
585 "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\
586 "psllw $3, %%mm3 \n\t" /*(U-128)8*/\
587 "pmulhw yCoeff, %%mm1 \n\t"\
588 \
589 \
590 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
591 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
592 "pmulhw ubCoeff, %%mm3 \n\t"\
593 "psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
594 "pmulhw ugCoeff, %%mm2 \n\t"\
595 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
596 "psubw w80, %%mm0 \n\t" /* (V-128)*/\
597 "psllw $3, %%mm0 \n\t" /* (V-128)8*/\
598 \
599 \
600 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
601 "pmulhw vrCoeff, %%mm0 \n\t"\
602 "pmulhw vgCoeff, %%mm4 \n\t"\
603 "paddw %%mm1, %%mm3 \n\t" /* B*/\
604 "paddw %%mm1, %%mm0 \n\t" /* R*/\
605 "packuswb %%mm3, %%mm3 \n\t"\
606 \
607 "packuswb %%mm0, %%mm0 \n\t"\
608 "paddw %%mm4, %%mm2 \n\t"\
609 "paddw %%mm2, %%mm1 \n\t" /* G*/\
610 \
611 "packuswb %%mm1, %%mm1 \n\t"
612
613 YSCALEYUV2RGB
614 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
615 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
616
617 "movq %%mm3, %%mm1 \n\t"
618 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
619 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
620 #ifdef HAVE_MMX2
621 "movntq %%mm3, (%4, %%eax, 4) \n\t"
622 "movntq %%mm1, 8(%4, %%eax, 4) \n\t"
623 #else
624 "movq %%mm3, (%4, %%eax, 4) \n\t"
625 "movq %%mm1, 8(%4, %%eax, 4) \n\t"
626 #endif
627 "addl $4, %%eax \n\t"
628 "cmpl %5, %%eax \n\t"
629 " jb 1b \n\t"
630
631
632 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
633 "m" (yalpha1), "m" (uvalpha1)
634 : "%eax"
635 );
636 }
637 else if(dstbpp==24)
638 {
639 asm volatile(
640
641 YSCALEYUV2RGB
642
643 // lsb ... msb
644 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
645 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
646
647 "movq %%mm3, %%mm1 \n\t"
648 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
649 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
650
651 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
652 "psrlq $8, %%mm3 \n\t" // GR0BGR00
653 "pand bm00000111, %%mm2 \n\t" // BGR00000
654 "pand bm11111000, %%mm3 \n\t" // 000BGR00
655 "por %%mm2, %%mm3 \n\t" // BGRBGR00
656 "movq %%mm1, %%mm2 \n\t"
657 "psllq $48, %%mm1 \n\t" // 000000BG
658 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
659
660 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
661 "psrld $16, %%mm2 \n\t" // R000R000
662 "psrlq $24, %%mm1 \n\t" // 0BGR0000
663 "por %%mm2, %%mm1 \n\t" // RBGRR000
664
665 "movl %4, %%ebx \n\t"
666 "addl %%eax, %%ebx \n\t"
667 #ifdef HAVE_MMX2
668 //FIXME Alignment
669 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
670 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
671 #else
672 "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
673 "psrlq $32, %%mm3 \n\t"
674 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
675 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
676 #endif
677 "addl $4, %%eax \n\t"
678 "cmpl %5, %%eax \n\t"
679 " jb 1b \n\t"
680
681 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
682 "m" (yalpha1), "m" (uvalpha1)
683 : "%eax", "%ebx"
684 );
685 }
686 else if(dstbpp==16)
687 {
688 asm volatile(
689
690 YSCALEYUV2RGB
691 #ifdef DITHER16BPP
692 "paddusb g16Dither, %%mm1 \n\t"
693 "paddusb b16Dither, %%mm0 \n\t"
694 "paddusb b16Dither, %%mm3 \n\t"
695 #endif
696 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
697 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
698 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
699
700 "psrlw $3, %%mm3 \n\t"
701 "psllw $3, %%mm1 \n\t"
702 "psllw $8, %%mm0 \n\t"
703 "pand g16Mask, %%mm1 \n\t"
704 "pand r16Mask, %%mm0 \n\t"
705
706 "por %%mm3, %%mm1 \n\t"
707 "por %%mm1, %%mm0 \n\t"
708 #ifdef HAVE_MMX2
709 "movntq %%mm0, (%4, %%eax, 2) \n\t"
710 #else
711 "movq %%mm0, (%4, %%eax, 2) \n\t"
712 #endif
713 "addl $4, %%eax \n\t"
714 "cmpl %5, %%eax \n\t"
715 " jb 1b \n\t"
716
717 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
718 "m" (yalpha1), "m" (uvalpha1)
719 : "%eax"
720 );
721 }
722 #else
723 if(dstbpp==32 || dstbpp==24)
724 {
725 for(i=0;i<dstw;i++){
726 // vertical linear interpolation && yuv2rgb in a single step:
727 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
728 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
729 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);
730 dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
731 dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
732 dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
733 dest+=dstbpp>>3;
734 }
735 }
736 else if(dstbpp==16)
737 {
738 for(i=0;i<dstw;i++){
739 // vertical linear interpolation && yuv2rgb in a single step:
740 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
741 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
742 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);
743
744 ((uint16_t*)dest)[0] =
745 (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
746 (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
747 (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
748 dest+=2;
749 }
750 }
751 else if(dstbpp==15) //15bit FIXME how do I figure out if it's 15 or 16?
752 {
753 for(i=0;i<dstw;i++){
754 // vertical linear interpolation && yuv2rgb in a single step:
755 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
756 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
757 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);
758
759 ((uint16_t*)dest)[0] =
760 (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
761 (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
762 (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
763 dest+=2;
764 }
765 }
766 #endif
767
768 b16Dither= b16Dither1;
769 1367 b16Dither1= b16Dither2;
770 1368 b16Dither2= b16Dither;
771 1369
772 1370 g16Dither= g16Dither1;
773 1371 g16Dither1= g16Dither2;
774 1372 g16Dither2= g16Dither;
1373 #endif
775 1374 }
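The assignments above rotate the dither biases each output line (b16Dither is what the paddusb lines add), alternating between the two rows 0x0004... and 0x0602... for a two-line ordered dither; b16Dither doubles as the temporary:

uint64_t cur = b16Dither1; /* row used on the next line */
b16Dither1 = b16Dither2;   /* swap the two rows */
b16Dither2 = cur;
b16Dither  = cur;          /* read by "paddusb b16Dither, ..." */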
776 1375
777 1376 #ifdef HAVE_3DNOW
778 1377 asm volatile("femms");
779 1378 #elif defined (HAVE_MMX)