Mercurial > mplayer.hg
comparison postproc/swscale.c @ 2316:bcb229557e9b
fixed alignment (static variables were sometimes not 8-byte aligned)
added half uv interpolation support
added prefetch
BGR15 support in MMX (untested) (so BGR15,16,24,32 are supported)
special unscaled height version (not much faster but it doesn't interpolate uv vertically)
author | michael |
---|---|
date | Sat, 20 Oct 2001 21:12:09 +0000 |
parents | 7a89cb124e81 |
children | 7d3542955132 |
comparison
equal
deleted
inserted
replaced
2315:c3c73ba53f0e | 2316:bcb229557e9b |
---|---|
1 | 1 |
2 // Software scaling and colorspace conversion routines for MPlayer | 2 // Software scaling and colorspace conversion routines for MPlayer |
3 | 3 |
4 // Orginal C implementation by A'rpi/ESP-team <arpi@thot.banki.hu> | 4 // Orginal C implementation by A'rpi/ESP-team <arpi@thot.banki.hu> |
5 // current version mostly by Michael Niedermayer (michaelni@gmx.at) | 5 // current version mostly by Michael Niedermayer (michaelni@gmx.at) |
6 // the parts written by michael are under GNU GPL | |
6 | 7 |
7 #include <inttypes.h> | 8 #include <inttypes.h> |
8 #include "../config.h" | 9 #include "../config.h" |
10 #include "swscale.h" | |
9 | 11 |
10 //#undef HAVE_MMX2 | 12 //#undef HAVE_MMX2 |
11 //#undef HAVE_MMX | 13 //#undef HAVE_MMX |
12 //#undef ARCH_X86 | 14 //#undef ARCH_X86 |
13 #define DITHER16BPP | 15 #define DITHER1XBPP |
14 //#define ALT_ERROR | 16 int fullUVIpol=0; |
17 //disables the unscaled height version | |
18 int allwaysIpol=0; | |
15 | 19 |
16 #define RET 0xC3 //near return opcode | 20 #define RET 0xC3 //near return opcode |
17 /* | 21 /* |
18 NOTES | 22 NOTES |
19 | 23 |
20 known BUGS with known cause (no bugreports please!) | 24 known BUGS with known cause (no bugreports please!, but patches are welcome :) ) |
21 code reads 1 sample too much (might cause a sig11) | 25 horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11) |
26 | |
27 Supported output formats BGR15 BGR16 BGR24 BGR32 (15,24 are untested) | |
28 BGR15 & BGR16 MMX verions support dithering | |
29 Special versions: fast Y 1:1 scaling (no interpolation in y direction) | |
22 | 30 |
23 TODO | 31 TODO |
24 check alignment off everything | 32 more intelligent missalignment avoidance for the horizontal scaler |
25 */ | 33 */ |
26 | 34 |
27 static uint64_t yCoeff= 0x2568256825682568LL; | 35 #define ABS(a) ((a) > 0 ? (a) : (-(a))) |
28 static uint64_t ubCoeff= 0x3343334333433343LL; | 36 |
29 static uint64_t vrCoeff= 0x40cf40cf40cf40cfLL; | 37 #ifdef HAVE_MMX2 |
30 static uint64_t ugCoeff= 0xE5E2E5E2E5E2E5E2LL; | 38 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" |
31 static uint64_t vgCoeff= 0xF36EF36EF36EF36ELL; | 39 #elif defined (HAVE_3DNOW) |
32 static uint64_t w80= 0x0080008000800080LL; | 40 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" |
33 static uint64_t w10= 0x0010001000100010LL; | 41 #endif |
34 static uint64_t bm00000111=0x0000000000FFFFFFLL; | 42 |
35 static uint64_t bm11111000=0xFFFFFFFFFF000000LL; | 43 #ifdef HAVE_MMX2 |
36 | 44 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" |
37 static uint64_t b16Dither= 0x0004000400040004LL; | 45 #else |
38 static uint64_t b16Dither1=0x0004000400040004LL; | 46 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t" |
39 static uint64_t b16Dither2=0x0602060206020602LL; | 47 #endif |
40 static uint64_t g16Dither= 0x0002000200020002LL; | 48 |
41 static uint64_t g16Dither1=0x0002000200020002LL; | 49 |
42 static uint64_t g16Dither2=0x0301030103010301LL; | 50 #ifdef HAVE_MMX |
43 | 51 static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL; |
44 static uint64_t b16Mask= 0x001F001F001F001FLL; | 52 static uint64_t __attribute__((aligned(8))) ubCoeff= 0x3343334333433343LL; |
45 static uint64_t g16Mask= 0x07E007E007E007E0LL; | 53 static uint64_t __attribute__((aligned(8))) vrCoeff= 0x40cf40cf40cf40cfLL; |
46 static uint64_t r16Mask= 0xF800F800F800F800LL; | 54 static uint64_t __attribute__((aligned(8))) ugCoeff= 0xE5E2E5E2E5E2E5E2LL; |
47 static uint64_t temp0; | 55 static uint64_t __attribute__((aligned(8))) vgCoeff= 0xF36EF36EF36EF36ELL; |
48 | 56 static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL; |
57 static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL; | |
58 static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL; | |
59 static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL; | |
60 static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL; | |
61 static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL; | |
62 | |
63 static uint64_t __attribute__((aligned(8))) b16Dither= 0x0004000400040004LL; | |
64 static uint64_t __attribute__((aligned(8))) b16Dither1=0x0004000400040004LL; | |
65 static uint64_t __attribute__((aligned(8))) b16Dither2=0x0602060206020602LL; | |
66 static uint64_t __attribute__((aligned(8))) g16Dither= 0x0002000200020002LL; | |
67 static uint64_t __attribute__((aligned(8))) g16Dither1=0x0002000200020002LL; | |
68 static uint64_t __attribute__((aligned(8))) g16Dither2=0x0301030103010301LL; | |
69 | |
70 static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL; | |
71 static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL; | |
72 static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL; | |
73 static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL; | |
74 static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL; | |
75 static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL; | |
76 | |
77 static uint64_t __attribute__((aligned(8))) temp0; | |
78 static uint64_t __attribute__((aligned(8))) asm_yalpha1; | |
79 static uint64_t __attribute__((aligned(8))) asm_uvalpha1; | |
80 #endif | |
49 | 81 |
50 // temporary storage for 4 yuv lines: | 82 // temporary storage for 4 yuv lines: |
51 // 16bit for now (mmx likes it more compact) | 83 // 16bit for now (mmx likes it more compact) |
84 #ifdef HAVE_MMX | |
85 static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048]; | |
86 static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2]; | |
87 #else | |
52 static uint16_t pix_buf_y[4][2048]; | 88 static uint16_t pix_buf_y[4][2048]; |
53 static uint16_t pix_buf_uv[2][2048*2]; | 89 static uint16_t pix_buf_uv[2][2048*2]; |
90 #endif | |
54 | 91 |
55 // clipping helper table for C implementations: | 92 // clipping helper table for C implementations: |
56 static unsigned char clip_table[768]; | 93 static unsigned char clip_table[768]; |
57 | 94 |
58 // yuv->rgb conversion tables: | 95 // yuv->rgb conversion tables: |
63 static int yuvtab_40cf[256]; | 100 static int yuvtab_40cf[256]; |
64 | 101 |
65 | 102 |
66 static uint8_t funnyYCode[10000]; | 103 static uint8_t funnyYCode[10000]; |
67 static uint8_t funnyUVCode[10000]; | 104 static uint8_t funnyUVCode[10000]; |
105 | |
106 #define FULL_YSCALEYUV2RGB \ | |
107 "pxor %%mm7, %%mm7 \n\t"\ | |
108 "movd %6, %%mm6 \n\t" /*yalpha1*/\ | |
109 "punpcklwd %%mm6, %%mm6 \n\t"\ | |
110 "punpcklwd %%mm6, %%mm6 \n\t"\ | |
111 "movd %7, %%mm5 \n\t" /*uvalpha1*/\ | |
112 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
113 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
114 "xorl %%eax, %%eax \n\t"\ | |
115 "1: \n\t"\ | |
116 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ | |
117 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
118 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
119 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
120 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
121 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | |
122 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
123 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
124 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | |
125 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
126 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ | |
127 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
128 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ | |
129 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | |
130 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
131 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\ | |
132 "psubw w400, %%mm3 \n\t" /* 8(U-128)*/\ | |
133 "pmulhw yCoeff, %%mm1 \n\t"\ | |
134 \ | |
135 \ | |
136 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
137 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
138 "pmulhw ubCoeff, %%mm3 \n\t"\ | |
139 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ | |
140 "pmulhw ugCoeff, %%mm2 \n\t"\ | |
141 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | |
142 "psubw w400, %%mm0 \n\t" /* (V-128)8*/\ | |
143 \ | |
144 \ | |
145 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ | |
146 "pmulhw vrCoeff, %%mm0 \n\t"\ | |
147 "pmulhw vgCoeff, %%mm4 \n\t"\ | |
148 "paddw %%mm1, %%mm3 \n\t" /* B*/\ | |
149 "paddw %%mm1, %%mm0 \n\t" /* R*/\ | |
150 "packuswb %%mm3, %%mm3 \n\t"\ | |
151 \ | |
152 "packuswb %%mm0, %%mm0 \n\t"\ | |
153 "paddw %%mm4, %%mm2 \n\t"\ | |
154 "paddw %%mm2, %%mm1 \n\t" /* G*/\ | |
155 \ | |
156 "packuswb %%mm1, %%mm1 \n\t" | |
157 | |
158 #define YSCALEYUV2RGB \ | |
159 "movd %6, %%mm6 \n\t" /*yalpha1*/\ | |
160 "punpcklwd %%mm6, %%mm6 \n\t"\ | |
161 "punpcklwd %%mm6, %%mm6 \n\t"\ | |
162 "movq %%mm6, asm_yalpha1 \n\t"\ | |
163 "movd %7, %%mm5 \n\t" /*uvalpha1*/\ | |
164 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
165 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
166 "movq %%mm5, asm_uvalpha1 \n\t"\ | |
167 "xorl %%eax, %%eax \n\t"\ | |
168 "1: \n\t"\ | |
169 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
170 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
171 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | |
172 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
173 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | |
174 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
175 "movq asm_uvalpha1, %%mm0 \n\t"\ | |
176 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
177 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
178 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ | |
179 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ | |
180 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | |
181 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | |
182 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\ | |
183 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\ | |
184 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
185 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
186 "pmulhw ugCoeff, %%mm3 \n\t"\ | |
187 "pmulhw vgCoeff, %%mm4 \n\t"\ | |
188 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
189 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ | |
190 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
191 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\ | |
192 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\ | |
193 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
194 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
195 "pmulhw asm_yalpha1, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
196 "pmulhw asm_yalpha1, %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
197 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | |
198 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | |
199 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
200 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
201 "pmulhw ubCoeff, %%mm2 \n\t"\ | |
202 "pmulhw vrCoeff, %%mm5 \n\t"\ | |
203 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\ | |
204 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\ | |
205 "pmulhw yCoeff, %%mm1 \n\t"\ | |
206 "pmulhw yCoeff, %%mm7 \n\t"\ | |
207 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
208 "paddw %%mm3, %%mm4 \n\t"\ | |
209 "movq %%mm2, %%mm0 \n\t"\ | |
210 "movq %%mm5, %%mm6 \n\t"\ | |
211 "movq %%mm4, %%mm3 \n\t"\ | |
212 "punpcklwd %%mm2, %%mm2 \n\t"\ | |
213 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
214 "punpcklwd %%mm4, %%mm4 \n\t"\ | |
215 "paddw %%mm1, %%mm2 \n\t"\ | |
216 "paddw %%mm1, %%mm5 \n\t"\ | |
217 "paddw %%mm1, %%mm4 \n\t"\ | |
218 "punpckhwd %%mm0, %%mm0 \n\t"\ | |
219 "punpckhwd %%mm6, %%mm6 \n\t"\ | |
220 "punpckhwd %%mm3, %%mm3 \n\t"\ | |
221 "paddw %%mm7, %%mm0 \n\t"\ | |
222 "paddw %%mm7, %%mm6 \n\t"\ | |
223 "paddw %%mm7, %%mm3 \n\t"\ | |
224 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
225 "packuswb %%mm0, %%mm2 \n\t"\ | |
226 "packuswb %%mm6, %%mm5 \n\t"\ | |
227 "packuswb %%mm3, %%mm4 \n\t"\ | |
228 "pxor %%mm7, %%mm7 \n\t" | |
229 | |
230 #define YSCALEYUV2RGB1 \ | |
231 "xorl %%eax, %%eax \n\t"\ | |
232 "1: \n\t"\ | |
233 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\ | |
234 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
235 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ | |
236 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ | |
237 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\ | |
238 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\ | |
239 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
240 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
241 "pmulhw ugCoeff, %%mm3 \n\t"\ | |
242 "pmulhw vgCoeff, %%mm4 \n\t"\ | |
243 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
244 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\ | |
245 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\ | |
246 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | |
247 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | |
248 "pmulhw ubCoeff, %%mm2 \n\t"\ | |
249 "pmulhw vrCoeff, %%mm5 \n\t"\ | |
250 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\ | |
251 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\ | |
252 "pmulhw yCoeff, %%mm1 \n\t"\ | |
253 "pmulhw yCoeff, %%mm7 \n\t"\ | |
254 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
255 "paddw %%mm3, %%mm4 \n\t"\ | |
256 "movq %%mm2, %%mm0 \n\t"\ | |
257 "movq %%mm5, %%mm6 \n\t"\ | |
258 "movq %%mm4, %%mm3 \n\t"\ | |
259 "punpcklwd %%mm2, %%mm2 \n\t"\ | |
260 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
261 "punpcklwd %%mm4, %%mm4 \n\t"\ | |
262 "paddw %%mm1, %%mm2 \n\t"\ | |
263 "paddw %%mm1, %%mm5 \n\t"\ | |
264 "paddw %%mm1, %%mm4 \n\t"\ | |
265 "punpckhwd %%mm0, %%mm0 \n\t"\ | |
266 "punpckhwd %%mm6, %%mm6 \n\t"\ | |
267 "punpckhwd %%mm3, %%mm3 \n\t"\ | |
268 "paddw %%mm7, %%mm0 \n\t"\ | |
269 "paddw %%mm7, %%mm6 \n\t"\ | |
270 "paddw %%mm7, %%mm3 \n\t"\ | |
271 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
272 "packuswb %%mm0, %%mm2 \n\t"\ | |
273 "packuswb %%mm6, %%mm5 \n\t"\ | |
274 "packuswb %%mm3, %%mm4 \n\t"\ | |
275 "pxor %%mm7, %%mm7 \n\t" | |
276 | |
277 #define WRITEBGR32 \ | |
278 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ | |
279 "movq %%mm2, %%mm1 \n\t" /* B */\ | |
280 "movq %%mm5, %%mm6 \n\t" /* R */\ | |
281 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
282 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
283 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
284 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
285 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
286 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
287 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
288 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
289 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
290 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
291 \ | |
292 MOVNTQ(%%mm0, (%4, %%eax, 4))\ | |
293 MOVNTQ(%%mm2, 8(%4, %%eax, 4))\ | |
294 MOVNTQ(%%mm1, 16(%4, %%eax, 4))\ | |
295 MOVNTQ(%%mm3, 24(%4, %%eax, 4))\ | |
296 \ | |
297 "addl $8, %%eax \n\t"\ | |
298 "cmpl %5, %%eax \n\t"\ | |
299 " jb 1b \n\t" | |
300 | |
301 #define WRITEBGR16 \ | |
302 "movq %%mm2, %%mm1 \n\t" /* B */\ | |
303 "movq %%mm4, %%mm3 \n\t" /* G */\ | |
304 "movq %%mm5, %%mm6 \n\t" /* R */\ | |
305 \ | |
306 "punpcklbw %%mm7, %%mm3 \n\t" /* 0G0G0G0G */\ | |
307 "punpcklbw %%mm7, %%mm2 \n\t" /* 0B0B0B0B */\ | |
308 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R */\ | |
309 \ | |
310 "psrlw $3, %%mm2 \n\t"\ | |
311 "psllw $3, %%mm3 \n\t"\ | |
312 "psllw $8, %%mm5 \n\t"\ | |
313 \ | |
314 "pand g16Mask, %%mm3 \n\t"\ | |
315 "pand r16Mask, %%mm5 \n\t"\ | |
316 \ | |
317 "por %%mm3, %%mm2 \n\t"\ | |
318 "por %%mm5, %%mm2 \n\t"\ | |
319 \ | |
320 "punpckhbw %%mm7, %%mm4 \n\t" /* 0G0G0G0G */\ | |
321 "punpckhbw %%mm7, %%mm1 \n\t" /* 0B0B0B0B */\ | |
322 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R */\ | |
323 \ | |
324 "psrlw $3, %%mm1 \n\t"\ | |
325 "psllw $3, %%mm4 \n\t"\ | |
326 "psllw $8, %%mm6 \n\t"\ | |
327 \ | |
328 "pand g16Mask, %%mm4 \n\t"\ | |
329 "pand r16Mask, %%mm6 \n\t"\ | |
330 \ | |
331 "por %%mm4, %%mm1 \n\t"\ | |
332 "por %%mm6, %%mm1 \n\t"\ | |
333 \ | |
334 MOVNTQ(%%mm2, (%4, %%eax, 2))\ | |
335 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\ | |
336 \ | |
337 "addl $8, %%eax \n\t"\ | |
338 "cmpl %5, %%eax \n\t"\ | |
339 " jb 1b \n\t" | |
340 | |
341 #define WRITEBGR15 \ | |
342 "movq %%mm2, %%mm1 \n\t" /* B */\ | |
343 "movq %%mm4, %%mm3 \n\t" /* G */\ | |
344 "movq %%mm5, %%mm6 \n\t" /* R */\ | |
345 \ | |
346 "punpcklbw %%mm7, %%mm3 \n\t" /* 0G0G0G0G */\ | |
347 "punpcklbw %%mm7, %%mm2 \n\t" /* 0B0B0B0B */\ | |
348 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R */\ | |
349 \ | |
350 "psrlw $3, %%mm2 \n\t"\ | |
351 "psllw $2, %%mm3 \n\t"\ | |
352 "psllw $7, %%mm5 \n\t"\ | |
353 \ | |
354 "pand g15Mask, %%mm3 \n\t"\ | |
355 "pand r15Mask, %%mm5 \n\t"\ | |
356 \ | |
357 "por %%mm3, %%mm2 \n\t"\ | |
358 "por %%mm5, %%mm2 \n\t"\ | |
359 \ | |
360 "punpckhbw %%mm7, %%mm4 \n\t" /* 0G0G0G0G */\ | |
361 "punpckhbw %%mm7, %%mm1 \n\t" /* 0B0B0B0B */\ | |
362 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R */\ | |
363 \ | |
364 "psrlw $3, %%mm1 \n\t"\ | |
365 "psllw $2, %%mm4 \n\t"\ | |
366 "psllw $7, %%mm6 \n\t"\ | |
367 \ | |
368 "pand g15Mask, %%mm4 \n\t"\ | |
369 "pand r15Mask, %%mm6 \n\t"\ | |
370 \ | |
371 "por %%mm4, %%mm1 \n\t"\ | |
372 "por %%mm6, %%mm1 \n\t"\ | |
373 \ | |
374 MOVNTQ(%%mm2, (%4, %%eax, 2))\ | |
375 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\ | |
376 \ | |
377 "addl $8, %%eax \n\t"\ | |
378 "cmpl %5, %%eax \n\t"\ | |
379 " jb 1b \n\t" | |
380 // FIXME find a faster way to shuffle it to BGR24 | |
381 #define WRITEBGR24 \ | |
382 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ | |
383 "movq %%mm2, %%mm1 \n\t" /* B */\ | |
384 "movq %%mm5, %%mm6 \n\t" /* R */\ | |
385 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
386 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
387 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
388 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
389 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
390 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
391 "punpcklbw %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
392 "punpckhbw %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
393 "punpcklbw %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
394 "punpckhbw %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
395 \ | |
396 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ | |
397 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\ | |
398 "pand bm00000111, %%mm4 \n\t" /* 00000RGB 0 */\ | |
399 "pand bm11111000, %%mm0 \n\t" /* 00RGB000 0.5 */\ | |
400 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\ | |
401 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\ | |
402 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\ | |
403 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ | |
404 \ | |
405 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
406 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\ | |
407 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\ | |
408 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\ | |
409 "pand bm00001111, %%mm2 \n\t" /* 0000RGBR 1 */\ | |
410 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\ | |
411 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\ | |
412 "pand bm00000111, %%mm4 \n\t" /* 00000RGB 2 */\ | |
413 "pand bm11111000, %%mm1 \n\t" /* 00RGB000 2.5 */\ | |
414 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\ | |
415 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\ | |
416 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\ | |
417 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\ | |
418 \ | |
419 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\ | |
420 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\ | |
421 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\ | |
422 "pand bm00000111, %%mm5 \n\t" /* 00000RGB 3 */\ | |
423 "pand bm11111000, %%mm3 \n\t" /* 00RGB000 3.5 */\ | |
424 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\ | |
425 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\ | |
426 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\ | |
427 \ | |
428 "leal (%%eax, %%eax, 2), %%ebx \n\t"\ | |
429 MOVNTQ(%%mm0, (%4, %%ebx))\ | |
430 MOVNTQ(%%mm2, 8(%4, %%ebx))\ | |
431 MOVNTQ(%%mm3, 16(%4, %%ebx))\ | |
432 \ | |
433 "addl $8, %%eax \n\t"\ | |
434 "cmpl %5, %%eax \n\t"\ | |
435 " jb 1b \n\t" | |
436 | |
437 | |
438 /** | |
439 * vertical scale YV12 to RGB | |
440 */ | |
441 static inline void yuv2rgbX(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, | |
442 uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp) | |
443 { | |
444 int yalpha1=yalpha^4095; | |
445 int uvalpha1=uvalpha^4095; | |
446 int i; | |
447 | |
448 if(fullUVIpol) | |
449 { | |
450 | |
451 #ifdef HAVE_MMX | |
452 if(dstbpp == 32) | |
453 { | |
454 asm volatile( | |
455 | |
456 | |
457 FULL_YSCALEYUV2RGB | |
458 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
459 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
460 | |
461 "movq %%mm3, %%mm1 \n\t" | |
462 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
463 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
464 | |
465 MOVNTQ(%%mm3, (%4, %%eax, 4)) | |
466 MOVNTQ(%%mm1, 8(%4, %%eax, 4)) | |
467 | |
468 "addl $4, %%eax \n\t" | |
469 "cmpl %5, %%eax \n\t" | |
470 " jb 1b \n\t" | |
471 | |
472 | |
473 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), | |
474 "m" (yalpha1), "m" (uvalpha1) | |
475 : "%eax" | |
476 ); | |
477 } | |
478 else if(dstbpp==24) | |
479 { | |
480 asm volatile( | |
481 | |
482 FULL_YSCALEYUV2RGB | |
483 | |
484 // lsb ... msb | |
485 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
486 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
487 | |
488 "movq %%mm3, %%mm1 \n\t" | |
489 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
490 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
491 | |
492 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0 | |
493 "psrlq $8, %%mm3 \n\t" // GR0BGR00 | |
494 "pand bm00000111, %%mm2 \n\t" // BGR00000 | |
495 "pand bm11111000, %%mm3 \n\t" // 000BGR00 | |
496 "por %%mm2, %%mm3 \n\t" // BGRBGR00 | |
497 "movq %%mm1, %%mm2 \n\t" | |
498 "psllq $48, %%mm1 \n\t" // 000000BG | |
499 "por %%mm1, %%mm3 \n\t" // BGRBGRBG | |
500 | |
501 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0 | |
502 "psrld $16, %%mm2 \n\t" // R000R000 | |
503 "psrlq $24, %%mm1 \n\t" // 0BGR0000 | |
504 "por %%mm2, %%mm1 \n\t" // RBGRR000 | |
505 | |
506 "movl %4, %%ebx \n\t" | |
507 "addl %%eax, %%ebx \n\t" | |
508 | |
509 #ifdef HAVE_MMX2 | |
510 //FIXME Alignment | |
511 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t" | |
512 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t" | |
513 #else | |
514 "movd %%mm3, (%%ebx, %%eax, 2) \n\t" | |
515 "psrlq $32, %%mm3 \n\t" | |
516 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t" | |
517 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t" | |
518 #endif | |
519 "addl $4, %%eax \n\t" | |
520 "cmpl %5, %%eax \n\t" | |
521 " jb 1b \n\t" | |
522 | |
523 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw), | |
524 "m" (yalpha1), "m" (uvalpha1) | |
525 : "%eax", "%ebx" | |
526 ); | |
527 } | |
528 else if(dstbpp==15) | |
529 { | |
530 asm volatile( | |
531 | |
532 FULL_YSCALEYUV2RGB | |
533 #ifdef DITHER1XBPP | |
534 "paddusb b16Dither, %%mm1 \n\t" | |
535 "paddusb b16Dither, %%mm0 \n\t" | |
536 "paddusb b16Dither, %%mm3 \n\t" | |
537 #endif | |
538 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | |
539 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
540 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
541 | |
542 "psrlw $3, %%mm3 \n\t" | |
543 "psllw $2, %%mm1 \n\t" | |
544 "psllw $7, %%mm0 \n\t" | |
545 "pand g15Mask, %%mm1 \n\t" | |
546 "pand r15Mask, %%mm0 \n\t" | |
547 | |
548 "por %%mm3, %%mm1 \n\t" | |
549 "por %%mm1, %%mm0 \n\t" | |
550 | |
551 MOVNTQ(%%mm0, (%4, %%eax, 2)) | |
552 | |
553 "addl $4, %%eax \n\t" | |
554 "cmpl %5, %%eax \n\t" | |
555 " jb 1b \n\t" | |
556 | |
557 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), | |
558 "m" (yalpha1), "m" (uvalpha1) | |
559 : "%eax" | |
560 ); | |
561 } | |
562 else if(dstbpp==16) | |
563 { | |
564 asm volatile( | |
565 | |
566 FULL_YSCALEYUV2RGB | |
567 #ifdef DITHER1XBPP | |
568 "paddusb g16Dither, %%mm1 \n\t" | |
569 "paddusb b16Dither, %%mm0 \n\t" | |
570 "paddusb b16Dither, %%mm3 \n\t" | |
571 #endif | |
572 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | |
573 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
574 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
575 | |
576 "psrlw $3, %%mm3 \n\t" | |
577 "psllw $3, %%mm1 \n\t" | |
578 "psllw $8, %%mm0 \n\t" | |
579 "pand g16Mask, %%mm1 \n\t" | |
580 "pand r16Mask, %%mm0 \n\t" | |
581 | |
582 "por %%mm3, %%mm1 \n\t" | |
583 "por %%mm1, %%mm0 \n\t" | |
584 | |
585 MOVNTQ(%%mm0, (%4, %%eax, 2)) | |
586 | |
587 "addl $4, %%eax \n\t" | |
588 "cmpl %5, %%eax \n\t" | |
589 " jb 1b \n\t" | |
590 | |
591 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), | |
592 "m" (yalpha1), "m" (uvalpha1) | |
593 : "%eax" | |
594 ); | |
595 } | |
596 #else | |
597 if(dstbpp==32 || dstbpp==24) | |
598 { | |
599 for(i=0;i<dstw;i++){ | |
600 // vertical linear interpolation && yuv2rgb in a single step: | |
601 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
602 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
603 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
604 dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)]; | |
605 dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]; | |
606 dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)]; | |
607 dest+=dstbpp>>3; | |
608 } | |
609 } | |
610 else if(dstbpp==16) | |
611 { | |
612 for(i=0;i<dstw;i++){ | |
613 // vertical linear interpolation && yuv2rgb in a single step: | |
614 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
615 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
616 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
617 | |
618 ((uint16_t*)dest)[0] = | |
619 (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) | | |
620 (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 | | |
621 (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800; | |
622 dest+=2; | |
623 } | |
624 } | |
625 else if(dstbpp==15) | |
626 { | |
627 for(i=0;i<dstw;i++){ | |
628 // vertical linear interpolation && yuv2rgb in a single step: | |
629 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
630 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
631 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
632 | |
633 ((uint16_t*)dest)[0] = | |
634 (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) | | |
635 (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 | | |
636 (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00; | |
637 dest+=2; | |
638 } | |
639 } | |
640 #endif | |
641 }//FULL_UV_IPOL | |
642 else | |
643 { | |
644 #ifdef HAVE_MMX | |
645 if(dstbpp == 32) | |
646 { | |
647 asm volatile( | |
648 YSCALEYUV2RGB | |
649 WRITEBGR32 | |
650 | |
651 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), | |
652 "m" (yalpha1), "m" (uvalpha1) | |
653 : "%eax" | |
654 ); | |
655 } | |
656 else if(dstbpp==24) | |
657 { | |
658 asm volatile( | |
659 YSCALEYUV2RGB | |
660 WRITEBGR24 | |
661 | |
662 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), | |
663 "m" (yalpha1), "m" (uvalpha1) | |
664 : "%eax", "%ebx" | |
665 ); | |
666 } | |
667 else if(dstbpp==15) | |
668 { | |
669 asm volatile( | |
670 YSCALEYUV2RGB | |
671 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
672 #ifdef DITHER1XBPP | |
673 "paddusb b16Dither, %%mm2 \n\t" | |
674 "paddusb b16Dither, %%mm4 \n\t" | |
675 "paddusb b16Dither, %%mm5 \n\t" | |
676 #endif | |
677 | |
678 WRITEBGR15 | |
679 | |
680 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), | |
681 "m" (yalpha1), "m" (uvalpha1) | |
682 : "%eax" | |
683 ); | |
684 } | |
685 else if(dstbpp==16) | |
686 { | |
687 asm volatile( | |
688 YSCALEYUV2RGB | |
689 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
690 #ifdef DITHER1XBPP | |
691 "paddusb g16Dither, %%mm2 \n\t" | |
692 "paddusb b16Dither, %%mm4 \n\t" | |
693 "paddusb b16Dither, %%mm5 \n\t" | |
694 #endif | |
695 | |
696 WRITEBGR16 | |
697 | |
698 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), | |
699 "m" (yalpha1), "m" (uvalpha1) | |
700 : "%eax" | |
701 ); | |
702 } | |
703 #else | |
704 //FIXME unroll C loop and dont recalculate UV | |
705 if(dstbpp==32 || dstbpp==24) | |
706 { | |
707 for(i=0;i<dstw;i++){ | |
708 // vertical linear interpolation && yuv2rgb in a single step: | |
709 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
710 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); | |
711 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); | |
712 dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)]; | |
713 dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]; | |
714 dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)]; | |
715 dest+=dstbpp>>3; | |
716 } | |
717 } | |
718 else if(dstbpp==16) | |
719 { | |
720 for(i=0;i<dstw;i++){ | |
721 // vertical linear interpolation && yuv2rgb in a single step: | |
722 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
723 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); | |
724 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); | |
725 | |
726 ((uint16_t*)dest)[0] = | |
727 (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) | | |
728 (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 | | |
729 (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800; | |
730 dest+=2; | |
731 } | |
732 } | |
733 else if(dstbpp==15) | |
734 { | |
735 for(i=0;i<dstw;i++){ | |
736 // vertical linear interpolation && yuv2rgb in a single step: | |
737 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
738 int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); | |
739 int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); | |
740 | |
741 ((uint16_t*)dest)[0] = | |
742 (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) | | |
743 (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 | | |
744 (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00; | |
745 dest+=2; | |
746 } | |
747 } | |
748 #endif | |
749 } //!FULL_UV_IPOL | |
750 } | |
751 | |
/**
 * YV12 to RGB without scaling or interpolating
 *
 * Fast path used when the vertical scale factor is (nearly) 1:1: luma is
 * taken straight from buf0 (no vertical blend); chroma is still blended
 * vertically between uvbuf0/uvbuf1 in the C fallback, and half-width
 * (one U/V sample per two output pixels, i/2 indexing).
 *
 * buf0/buf1     horizontally-scaled luma lines, samples pre-scaled by 128
 * uvbuf0/uvbuf1 horizontally-scaled chroma lines; U at [i], V at [i+2048]
 * dest          output scanline (BGR byte order)
 * dstw          output width in pixels
 * yalpha/uvalpha 12-bit vertical blend weights (0..4095)
 * dstbpp        output depth: 32, 24, 16 or 15
 */
static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
			    uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
{
	// complements of the blend weights: alpha + alpha1 == 4095 (~1.0 in 12-bit)
	int yalpha1=yalpha^4095;
	int uvalpha1=uvalpha^4095;
	int i;
	// full-chroma interpolation (or forced interpolation) requested:
	// defer to the general scaled/interpolating renderer instead
	if(fullUVIpol || allwaysIpol)
	{
		yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
		return;
	}
#ifdef HAVE_MMX
	// MMX paths: YSCALEYUV2RGB1 converts one group of pixels into
	// mm2=B, mm4=G, mm5=R; the WRITEBGR* macros pack and store them
	if(dstbpp == 32)
	{
		asm volatile(
			YSCALEYUV2RGB1
			WRITEBGR32
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
		);
	}
	else if(dstbpp==24)
	{
		asm volatile(
			YSCALEYUV2RGB1
			WRITEBGR24
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"  // WRITEBGR24 additionally clobbers ebx
		);
	}
	else if(dstbpp==15)
	{
		asm volatile(
			YSCALEYUV2RGB1
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
			// ordered dither before truncating to 5-5-5;
			// all three channels are 5 bits wide, so b16Dither fits each
			"paddusb b16Dither, %%mm2	\n\t"
			"paddusb b16Dither, %%mm4	\n\t"
			"paddusb b16Dither, %%mm5	\n\t"
#endif
			WRITEBGR15
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
		);
	}
	else if(dstbpp==16)
	{
		asm volatile(
			YSCALEYUV2RGB1
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
			// NOTE(review): per the register comment above, mm2=B and mm4=G,
			// yet the 6-bit green dither (g16Dither) is added to mm2 and the
			// 5-bit dither (b16Dither) to mm4 — looks swapped; confirm against
			// the intended 5-6-5 dither assignment before changing.
			"paddusb g16Dither, %%mm2	\n\t"
			"paddusb b16Dither, %%mm4	\n\t"
			"paddusb b16Dither, %%mm5	\n\t"
#endif

			WRITEBGR16
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
		);
	}
#else
//FIXME unroll C loop and don't recalculate UV
	if(dstbpp==32 || dstbpp==24)
	{
		for(i=0;i<dstw;i++){
			// luma: no vertical blend (>>7 undoes the *128 pre-scale);
			// chroma: vertical linear interpolation, half horizontal resolution (i/2)
			int Y=yuvtab_2568[buf0[i]>>7];
			int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
			int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
			// BGR byte order; clip_table saturates to 0..255
			dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
			dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
			dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
			dest+=dstbpp>>3;  // advance 4 bytes (32bpp) or 3 bytes (24bpp)
		}
	}
	else if(dstbpp==16)
	{
		for(i=0;i<dstw;i++){
			// same sampling as above, packed into 5-6-5
			int Y=yuvtab_2568[buf0[i]>>7];
			int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
			int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);

			((uint16_t*)dest)[0] =
				(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
				(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
				(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
			dest+=2;
		}
	}
	else if(dstbpp==15)
	{
		for(i=0;i<dstw;i++){
			// same sampling as above, packed into 5-5-5
			int Y=yuvtab_2568[buf0[i]>>7];
			int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
			int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);

			((uint16_t*)dest)[0] =
				(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
				(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
				(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
			dest+=2;
		}
	}
#endif
}
867 | |
868 | |
68 | 869 |
69 | 870 |
70 // *** bilinear scaling and yuv->rgb conversion of yv12 slices: | 871 // *** bilinear scaling and yuv->rgb conversion of yv12 slices: |
71 // *** Note: it's called multiple times while decoding a frame, first time y==0 | 872 // *** Note: it's called multiple times while decoding a frame, first time y==0 |
72 // *** Designed to upscale, but may work for downscale too. | 873 // *** Designed to upscale, but may work for downscale too. |
93 | 894 |
94 #ifdef HAVE_MMX2 | 895 #ifdef HAVE_MMX2 |
95 // used to detect a horizontal size change | 896 // used to detect a horizontal size change |
96 static int old_dstw= -1; | 897 static int old_dstw= -1; |
97 static int old_s_xinc= -1; | 898 static int old_s_xinc= -1; |
98 | 899 #endif |
99 #endif | 900 |
100 int canMMX2BeUsed=0; | 901 int canMMX2BeUsed=0; |
101 int srcWidth= (dstw*s_xinc + 0x8000)>>16; | 902 int srcWidth= (dstw*s_xinc + 0x8000)>>16; |
903 int dstUVw= fullUVIpol ? dstw : dstw/2; | |
904 | |
102 | 905 |
103 #ifdef HAVE_MMX2 | 906 #ifdef HAVE_MMX2 |
104 canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0 && (srcWidth&15)==0) ? 1 : 0; | 907 canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0 && (srcWidth&15)==0) ? 1 : 0; |
105 #endif | 908 #endif |
106 | 909 |
109 // FIXME this is not perfect, but no one should notice the difference, the more correct variant | 912 // FIXME this is not perfect, but no one should notice the difference, the more correct variant
110 // would be like the vertical one, but that would require some special code for the | 913 // would be like the vertical one, but that would require some special code for the |
111 // first and last pixel | 914 // first and last pixel |
112 if(canMMX2BeUsed) s_xinc+= 20; | 915 if(canMMX2BeUsed) s_xinc+= 20; |
113 else s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20; | 916 else s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20; |
114 s_xinc2=s_xinc>>1; | 917 |
115 | 918 if(fullUVIpol) s_xinc2= s_xinc>>1; |
919 else s_xinc2= s_xinc; | |
116 // force calculation of the horizontal interpolation of the first line | 920 // force calculation of the horizontal interpolation of the first line |
117 s_last_ypos=-99; | 921 s_last_ypos=-99; |
118 s_last_y1pos=-99; | 922 s_last_y1pos=-99; |
119 | 923 |
120 if(y==0){ | 924 if(y==0){ |
213 | 1017 |
214 funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]= | 1018 funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]= |
215 funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]= | 1019 funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]= |
216 a | (b<<2) | (c<<4) | (d<<6); | 1020 a | (b<<2) | (c<<4) | (d<<6); |
217 | 1021 |
1022 // if we don't need to read 8 bytes then don't :), reduces the chance of | 
1023 // crossing a cache line | |
1024 if(d<3) funnyYCode[fragmentLength*i/4 + 1]= 0x6E; | |
1025 | |
218 funnyYCode[fragmentLength*(i+4)/4]= RET; | 1026 funnyYCode[fragmentLength*(i+4)/4]= RET; |
219 } | 1027 } |
220 xpos+=s_xinc; | 1028 xpos+=s_xinc; |
221 } | 1029 } |
222 | 1030 |
223 xpos= 0; //s_xinc2/2 - 0x10000; // difference between centers of chrom samples | 1031 xpos= 0; //s_xinc2/2 - 0x10000; // difference between centers of chrom samples |
224 for(i=0; i<dstw/8; i++) | 1032 for(i=0; i<dstUVw/8; i++) |
225 { | 1033 { |
226 int xx=xpos>>16; | 1034 int xx=xpos>>16; |
227 | 1035 |
228 if((i&3) == 0) | 1036 if((i&3) == 0) |
229 { | 1037 { |
236 | 1044 |
237 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]= | 1045 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]= |
238 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]= | 1046 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]= |
239 a | (b<<2) | (c<<4) | (d<<6); | 1047 a | (b<<2) | (c<<4) | (d<<6); |
240 | 1048 |
1049 // if we don't need to read 8 bytes then don't :), reduces the chance of | 
1050 // crossing a cache line | |
1051 if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E; | |
1052 | |
241 funnyUVCode[fragmentLength*(i+4)/4]= RET; | 1053 funnyUVCode[fragmentLength*(i+4)/4]= RET; |
242 } | 1054 } |
243 xpos+=s_xinc2; | 1055 xpos+=s_xinc2; |
244 } | 1056 } |
245 // funnyCode[0]= RET; | 1057 // funnyCode[0]= RET; |
253 unsigned char *dest=dstptr+dststride*s_ypos; | 1065 unsigned char *dest=dstptr+dststride*s_ypos; |
254 int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line | 1066 int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line |
255 // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) | 1067 // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) |
256 int srcuvpos= s_srcypos + s_yinc/2 - 0x8000; | 1068 int srcuvpos= s_srcypos + s_yinc/2 - 0x8000; |
257 int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line | 1069 int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line |
258 int yalpha=((s_srcypos-1)&0xFFFF)>>7; | 1070 int yalpha=((s_srcypos-1)&0xFFFF)>>4; |
259 int yalpha1=yalpha^511; | 1071 int uvalpha=((srcuvpos-1)&0x1FFFF)>>5; |
260 int uvalpha=((srcuvpos-1)&0x1FFFF)>>8; | |
261 int uvalpha1=uvalpha^511; | |
262 uint16_t *buf0=pix_buf_y[y0&1]; // top line of the interpolated slice | 1072 uint16_t *buf0=pix_buf_y[y0&1]; // top line of the interpolated slice |
263 uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice | 1073 uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice |
264 uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top line of the interpolated slice | 1074 uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top line of the interpolated slice |
265 uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1]; // bottom line of the interpolated slice | 1075 uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1]; // bottom line of the interpolated slice |
266 int i; | 1076 int i; |
318 "movl %1, %%edi \n\t" // buf1 | 1128 "movl %1, %%edi \n\t" // buf1 |
319 "movl %3, %%edx \n\t" // (s_xinc*4)>>16 | 1129 "movl %3, %%edx \n\t" // (s_xinc*4)>>16 |
320 "xorl %%ecx, %%ecx \n\t" | 1130 "xorl %%ecx, %%ecx \n\t" |
321 "xorl %%ebx, %%ebx \n\t" | 1131 "xorl %%ebx, %%ebx \n\t" |
322 "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF | 1132 "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF |
323 // "int $3\n\t" | 1133 #ifdef HAVE_MMX2 |
324 "call funnyYCode \n\t" | 1134 #define FUNNY_Y_CODE \ |
325 "movq temp0, %%mm2 \n\t" | 1135 "prefetchnta 1024(%%esi) \n\t"\ |
1136 "prefetchnta 1056(%%esi) \n\t"\ | |
1137 "prefetchnta 1088(%%esi) \n\t"\ | |
1138 "call funnyYCode \n\t"\ | |
1139 "movq temp0, %%mm2 \n\t"\ | |
326 "xorl %%ecx, %%ecx \n\t" | 1140 "xorl %%ecx, %%ecx \n\t" |
327 "call funnyYCode \n\t" | 1141 #else |
328 "movq temp0, %%mm2 \n\t" | 1142 #define FUNNY_Y_CODE \ |
1143 "call funnyYCode \n\t"\ | |
1144 "movq temp0, %%mm2 \n\t"\ | |
329 "xorl %%ecx, %%ecx \n\t" | 1145 "xorl %%ecx, %%ecx \n\t" |
330 "call funnyYCode \n\t" | 1146 #endif |
331 "movq temp0, %%mm2 \n\t" | 1147 FUNNY_Y_CODE |
332 "xorl %%ecx, %%ecx \n\t" | 1148 FUNNY_Y_CODE |
333 "call funnyYCode \n\t" | 1149 FUNNY_Y_CODE |
334 "movq temp0, %%mm2 \n\t" | 1150 FUNNY_Y_CODE |
335 "xorl %%ecx, %%ecx \n\t" | 1151 FUNNY_Y_CODE |
336 "call funnyYCode \n\t" | 1152 FUNNY_Y_CODE |
337 "movq temp0, %%mm2 \n\t" | 1153 FUNNY_Y_CODE |
338 "xorl %%ecx, %%ecx \n\t" | 1154 FUNNY_Y_CODE |
339 "call funnyYCode \n\t" | 1155 |
340 "movq temp0, %%mm2 \n\t" | |
341 "xorl %%ecx, %%ecx \n\t" | |
342 "call funnyYCode \n\t" | |
343 "movq temp0, %%mm2 \n\t" | |
344 "xorl %%ecx, %%ecx \n\t" | |
345 "call funnyYCode \n\t" | |
346 :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16), | 1156 :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16), |
347 "m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF) | 1157 "m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF) |
348 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | 1158 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" |
349 ); | 1159 ); |
350 for(i=dstw-1; (i*s_xinc)>>16 >=srcWidth-1; i--) buf1[i] = src[srcWidth-1]*128; | 1160 for(i=dstw-1; (i*s_xinc)>>16 >=srcWidth-1; i--) buf1[i] = src[srcWidth-1]*128; |
351 } | 1161 } |
352 else | 1162 else |
353 { | 1163 { |
354 #endif | 1164 #endif |
355 //NO MMX just normal asm ... FIXME try/write funny MMX2 variant | 1165 //NO MMX just normal asm ... |
356 //FIXME add prefetch | |
357 asm volatile( | 1166 asm volatile( |
358 "xorl %%eax, %%eax \n\t" // i | 1167 "xorl %%eax, %%eax \n\t" // i |
359 "xorl %%ebx, %%ebx \n\t" // xx | 1168 "xorl %%ebx, %%ebx \n\t" // xx |
360 "xorl %%ecx, %%ecx \n\t" // 2*xalpha | 1169 "xorl %%ecx, %%ecx \n\t" // 2*xalpha |
361 "1: \n\t" | 1170 "1: \n\t" |
436 "movl %3, %%edx \n\t" // (s_xinc*4)>>16 | 1245 "movl %3, %%edx \n\t" // (s_xinc*4)>>16 |
437 "xorl %%ecx, %%ecx \n\t" | 1246 "xorl %%ecx, %%ecx \n\t" |
438 "xorl %%ebx, %%ebx \n\t" | 1247 "xorl %%ebx, %%ebx \n\t" |
439 "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF | 1248 "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF |
440 | 1249 |
441 // "int $3\n\t" | 1250 #ifdef HAVE_MMX2 |
442 #define FUNNYUVCODE \ | 1251 #define FUNNYUVCODE \ |
443 "call funnyUVCode \n\t"\ | 1252 "prefetchnta 1024(%%esi) \n\t"\ |
444 "movq temp0, %%mm2 \n\t"\ | 1253 "prefetchnta 1056(%%esi) \n\t"\ |
445 "xorl %%ecx, %%ecx \n\t" | 1254 "prefetchnta 1088(%%esi) \n\t"\ |
446 | 1255 "call funnyUVCode \n\t"\ |
447 FUNNYUVCODE | 1256 "movq temp0, %%mm2 \n\t"\ |
448 FUNNYUVCODE | 1257 "xorl %%ecx, %%ecx \n\t" |
449 FUNNYUVCODE | 1258 #else |
450 FUNNYUVCODE | 1259 #define FUNNYUVCODE \ |
451 | 1260 "call funnyUVCode \n\t"\ |
452 FUNNYUVCODE | 1261 "movq temp0, %%mm2 \n\t"\ |
453 FUNNYUVCODE | 1262 "xorl %%ecx, %%ecx \n\t" |
454 FUNNYUVCODE | 1263 #endif |
455 FUNNYUVCODE | 1264 |
456 | 1265 FUNNYUVCODE |
1266 FUNNYUVCODE | |
1267 FUNNYUVCODE | |
1268 FUNNYUVCODE | |
1269 | |
1270 FUNNYUVCODE | |
1271 FUNNYUVCODE | |
1272 FUNNYUVCODE | |
1273 FUNNYUVCODE | |
457 | 1274 |
458 | 1275 |
459 "xorl %%eax, %%eax \n\t" // i | 1276 "xorl %%eax, %%eax \n\t" // i |
460 "movl %6, %%esi \n\t" // src | 1277 "movl %6, %%esi \n\t" // src |
461 "movl %1, %%edi \n\t" // buf1 | 1278 "movl %1, %%edi \n\t" // buf1 |
469 FUNNYUVCODE | 1286 FUNNYUVCODE |
470 FUNNYUVCODE | 1287 FUNNYUVCODE |
471 FUNNYUVCODE | 1288 FUNNYUVCODE |
472 FUNNYUVCODE | 1289 FUNNYUVCODE |
473 | 1290 |
474 :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>16), | 1291 :: "m" (src1), "m" (uvbuf1), "m" (dstUVw), "m" ((s_xinc2*4)>>16), |
475 "m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2) | 1292 "m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2) |
476 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | 1293 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" |
477 ); | 1294 ); |
478 for(i=dstw-1; (i*s_xinc2)>>16 >=srcWidth/2-1; i--) | 1295 for(i=dstUVw-1; (i*s_xinc2)>>16 >=srcWidth/2-1; i--) |
479 { | 1296 { |
480 uvbuf1[i] = src1[srcWidth/2-1]*128; | 1297 uvbuf1[i] = src1[srcWidth/2-1]*128; |
481 uvbuf1[i+2048] = src2[srcWidth/2-1]*128; | 1298 uvbuf1[i+2048] = src2[srcWidth/2-1]*128; |
482 } | 1299 } |
483 } | 1300 } |
514 "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | 1331 "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry |
515 "addl $1, %%eax \n\t" | 1332 "addl $1, %%eax \n\t" |
516 "cmpl %2, %%eax \n\t" | 1333 "cmpl %2, %%eax \n\t" |
517 " jb 1b \n\t" | 1334 " jb 1b \n\t" |
518 | 1335 |
519 | 1336 :: "m" (src1), "m" (uvbuf1), "m" (dstUVw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF), |
520 :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF), | |
521 "r" (src2) | 1337 "r" (src2) |
522 : "%eax", "%ebx", "%ecx", "%edi", "%esi" | 1338 : "%eax", "%ebx", "%ecx", "%edi", "%esi" |
523 ); | 1339 ); |
524 #ifdef HAVE_MMX2 | 1340 #ifdef HAVE_MMX2 |
525 } //if MMX2 cant be used | 1341 } //if MMX2 cant be used |
526 #endif | 1342 #endif |
527 #else | 1343 #else |
528 for(i=0;i<dstw;i++){ | 1344 for(i=0;i<dstUVw;i++){ |
529 register unsigned int xx=xpos>>16; | 1345 register unsigned int xx=xpos>>16; |
530 register unsigned int xalpha=(xpos&0xFFFF)>>9; | 1346 register unsigned int xalpha=(xpos&0xFFFF)>>9; |
531 uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | 1347 uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); |
532 uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | 1348 uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); |
533 xpos+=s_xinc2; | 1349 xpos+=s_xinc2; |
539 s_srcypos= s_yinc/2 - 0x8000; | 1355 s_srcypos= s_yinc/2 - 0x8000; |
540 continue; | 1356 continue; |
541 } | 1357 } |
542 } | 1358 } |
543 | 1359 |
544 | 1360 if(ABS(s_yinc - 0x10000) < 10) |
545 // Note1: this code can be resticted to n*8 (or n*16) width lines to simplify optimization... | 1361 yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp); |
546 // Re: Note1: ok n*4 for now | 1362 else |
547 // Note2: instead of using lookup tabs, mmx version could do the multiply... | 1363 yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp); |
548 // Re: Note2: yep | 1364 |
549 // Note3: maybe we should make separated 15/16, 24 and 32bpp version of this: | |
550 // Re: done (32 & 16) and 16 has dithering :) but 16 is untested | |
551 #ifdef HAVE_MMX | 1365 #ifdef HAVE_MMX |
552 //FIXME write lq version with less uv ... | 1366 b16Dither= b16Dither1; |
553 //FIXME reorder / optimize | |
554 if(dstbpp == 32) | |
555 { | |
556 asm volatile( | |
557 | |
558 #define YSCALEYUV2RGB \ | |
559 "pxor %%mm7, %%mm7 \n\t"\ | |
560 "movd %6, %%mm6 \n\t" /*yalpha1*/\ | |
561 "punpcklwd %%mm6, %%mm6 \n\t"\ | |
562 "punpcklwd %%mm6, %%mm6 \n\t"\ | |
563 "movd %7, %%mm5 \n\t" /*uvalpha1*/\ | |
564 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
565 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
566 "xorl %%eax, %%eax \n\t"\ | |
567 "1: \n\t"\ | |
568 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ | |
569 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
570 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
571 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
572 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
573 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | |
574 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
575 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
576 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\ | |
577 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
578 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\ | |
579 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
580 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ | |
581 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | |
582 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
583 "psubw w10, %%mm1 \n\t" /* Y-16*/\ | |
584 "psubw w80, %%mm3 \n\t" /* (U-128)*/\ | |
585 "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\ | |
586 "psllw $3, %%mm3 \n\t" /*(U-128)8*/\ | |
587 "pmulhw yCoeff, %%mm1 \n\t"\ | |
588 \ | |
589 \ | |
590 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
591 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
592 "pmulhw ubCoeff, %%mm3 \n\t"\ | |
593 "psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\ | |
594 "pmulhw ugCoeff, %%mm2 \n\t"\ | |
595 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | |
596 "psubw w80, %%mm0 \n\t" /* (V-128)*/\ | |
597 "psllw $3, %%mm0 \n\t" /* (V-128)8*/\ | |
598 \ | |
599 \ | |
600 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ | |
601 "pmulhw vrCoeff, %%mm0 \n\t"\ | |
602 "pmulhw vgCoeff, %%mm4 \n\t"\ | |
603 "paddw %%mm1, %%mm3 \n\t" /* B*/\ | |
604 "paddw %%mm1, %%mm0 \n\t" /* R*/\ | |
605 "packuswb %%mm3, %%mm3 \n\t"\ | |
606 \ | |
607 "packuswb %%mm0, %%mm0 \n\t"\ | |
608 "paddw %%mm4, %%mm2 \n\t"\ | |
609 "paddw %%mm2, %%mm1 \n\t" /* G*/\ | |
610 \ | |
611 "packuswb %%mm1, %%mm1 \n\t" | |
612 | |
613 YSCALEYUV2RGB | |
614 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
615 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
616 | |
617 "movq %%mm3, %%mm1 \n\t" | |
618 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
619 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
620 #ifdef HAVE_MMX2 | |
621 "movntq %%mm3, (%4, %%eax, 4) \n\t" | |
622 "movntq %%mm1, 8(%4, %%eax, 4) \n\t" | |
623 #else | |
624 "movq %%mm3, (%4, %%eax, 4) \n\t" | |
625 "movq %%mm1, 8(%4, %%eax, 4) \n\t" | |
626 #endif | |
627 "addl $4, %%eax \n\t" | |
628 "cmpl %5, %%eax \n\t" | |
629 " jb 1b \n\t" | |
630 | |
631 | |
632 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), | |
633 "m" (yalpha1), "m" (uvalpha1) | |
634 : "%eax" | |
635 ); | |
636 } | |
637 else if(dstbpp==24) | |
638 { | |
639 asm volatile( | |
640 | |
641 YSCALEYUV2RGB | |
642 | |
643 // lsb ... msb | |
644 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
645 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
646 | |
647 "movq %%mm3, %%mm1 \n\t" | |
648 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
649 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
650 | |
651 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0 | |
652 "psrlq $8, %%mm3 \n\t" // GR0BGR00 | |
653 "pand bm00000111, %%mm2 \n\t" // BGR00000 | |
654 "pand bm11111000, %%mm3 \n\t" // 000BGR00 | |
655 "por %%mm2, %%mm3 \n\t" // BGRBGR00 | |
656 "movq %%mm1, %%mm2 \n\t" | |
657 "psllq $48, %%mm1 \n\t" // 000000BG | |
658 "por %%mm1, %%mm3 \n\t" // BGRBGRBG | |
659 | |
660 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0 | |
661 "psrld $16, %%mm2 \n\t" // R000R000 | |
662 "psrlq $24, %%mm1 \n\t" // 0BGR0000 | |
663 "por %%mm2, %%mm1 \n\t" // RBGRR000 | |
664 | |
665 "movl %4, %%ebx \n\t" | |
666 "addl %%eax, %%ebx \n\t" | |
667 #ifdef HAVE_MMX2 | |
668 //FIXME Alignment | |
669 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t" | |
670 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t" | |
671 #else | |
672 "movd %%mm3, (%%ebx, %%eax, 2) \n\t" | |
673 "psrlq $32, %%mm3 \n\t" | |
674 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t" | |
675 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t" | |
676 #endif | |
677 "addl $4, %%eax \n\t" | |
678 "cmpl %5, %%eax \n\t" | |
679 " jb 1b \n\t" | |
680 | |
681 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw), | |
682 "m" (yalpha1), "m" (uvalpha1) | |
683 : "%eax", "%ebx" | |
684 ); | |
685 } | |
686 else if(dstbpp==16) | |
687 { | |
688 asm volatile( | |
689 | |
690 YSCALEYUV2RGB | |
691 #ifdef DITHER16BPP | |
692 "paddusb g16Dither, %%mm1 \n\t" | |
693 "paddusb b16Dither, %%mm0 \n\t" | |
694 "paddusb b16Dither, %%mm3 \n\t" | |
695 #endif | |
696 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | |
697 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
698 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
699 | |
700 "psrlw $3, %%mm3 \n\t" | |
701 "psllw $3, %%mm1 \n\t" | |
702 "psllw $8, %%mm0 \n\t" | |
703 "pand g16Mask, %%mm1 \n\t" | |
704 "pand r16Mask, %%mm0 \n\t" | |
705 | |
706 "por %%mm3, %%mm1 \n\t" | |
707 "por %%mm1, %%mm0 \n\t" | |
708 #ifdef HAVE_MMX2 | |
709 "movntq %%mm0, (%4, %%eax, 2) \n\t" | |
710 #else | |
711 "movq %%mm0, (%4, %%eax, 2) \n\t" | |
712 #endif | |
713 "addl $4, %%eax \n\t" | |
714 "cmpl %5, %%eax \n\t" | |
715 " jb 1b \n\t" | |
716 | |
717 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), | |
718 "m" (yalpha1), "m" (uvalpha1) | |
719 : "%eax" | |
720 ); | |
721 } | |
722 #else | |
723 if(dstbpp==32 || dstbpp==24) | |
724 { | |
725 for(i=0;i<dstw;i++){ | |
726 // vertical linear interpolation && yuv2rgb in a single step: | |
727 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)]; | |
728 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16); | |
729 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16); | |
730 dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)]; | |
731 dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]; | |
732 dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)]; | |
733 dest+=dstbpp>>3; | |
734 } | |
735 } | |
736 else if(dstbpp==16) | |
737 { | |
738 for(i=0;i<dstw;i++){ | |
739 // vertical linear interpolation && yuv2rgb in a single step: | |
740 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)]; | |
741 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16); | |
742 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16); | |
743 | |
744 ((uint16_t*)dest)[0] = | |
745 (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) | | |
746 (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 | | |
747 (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800; | |
748 dest+=2; | |
749 } | |
750 } | |
751 else if(dstbpp==15) //15bit FIXME how do i figure out if its 15 or 16? | |
752 { | |
753 for(i=0;i<dstw;i++){ | |
754 // vertical linear interpolation && yuv2rgb in a single step: | |
755 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)]; | |
756 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16); | |
757 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16); | |
758 | |
759 ((uint16_t*)dest)[0] = | |
760 (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) | | |
761 (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 | | |
762 (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00; | |
763 dest+=2; | |
764 } | |
765 } | |
766 #endif | |
767 | |
768 b16Dither= b16Dither1; | |
769 b16Dither1= b16Dither2; | 1367 b16Dither1= b16Dither2; |
770 b16Dither2= b16Dither; | 1368 b16Dither2= b16Dither; |
771 | 1369 |
772 g16Dither= g16Dither1; | 1370 g16Dither= g16Dither1; |
773 g16Dither1= g16Dither2; | 1371 g16Dither1= g16Dither2; |
774 g16Dither2= g16Dither; | 1372 g16Dither2= g16Dither; |
1373 #endif | |
775 } | 1374 } |
776 | 1375 |
777 #ifdef HAVE_3DNOW | 1376 #ifdef HAVE_3DNOW |
778 asm volatile("femms"); | 1377 asm volatile("femms"); |
779 #elif defined (HAVE_MMX) | 1378 #elif defined (HAVE_MMX) |