/*
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "asmalign.h"

/*
 * CPU-feature dependent instruction mnemonics used by the asm macros below.
 * Each name is #undef'ed first so the definitions here always win
 * (presumably because this template is included more than once with
 * different HAVE_* settings -- confirm against the including file).
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/* Prefetch mnemonics; "/nop" is the fallback when neither 3DNow! nor MMX2 is available. */
#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

/* Packed byte average; left undefined when neither MMX2 nor 3DNow! is available. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* 64-bit store: non-temporal (movntq) with MMX2, plain movq otherwise. */
#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
/* Extra level of indirection so macro arguments are expanded before being stringified. */
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

/*
 * Asm fragment: multi-tap vertical filter producing one plane of 8-bit output.
 *
 *   x      - byte displacement added to every source-line pointer
 *   offset - string literal: displacement of the filter list relative to %0
 *
 * Operands as used below: %0 = base address for VROUNDER_OFFSET and the
 * filter list, %1 = destination, %2 = width the index is compared against.
 * The filter list is walked in 16-byte steps: a source pointer at +0 and a
 * coefficient at +8; a zero source pointer ends the list.
 * Per outer iteration the accumulators mm3/mm4 start from the value at
 * VROUNDER_OFFSET(%0), collect the pmulhw products of all taps, are shifted
 * right by 3, packed with unsigned saturation and stored as 8 bytes.
 * Uses REG_a (index), REG_d (filter cursor), REG_S (source pointer), mm0-mm5.
 */
#define YSCALEYUV2YV12X(x, offset) \
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN16 /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
    "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t" /* flags survive the MMX ops up to the jnz */\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    /* re-arm accumulators and filter cursor for the next 8 outputs */\
    "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"

/*
 * Asm fragment: unfiltered 16-bit -> 8-bit conversion of one line.
 * Operands as used below: %0 = source (16-bit samples), %1 = destination,
 * %2 = initial index. Each iteration shifts 8 samples right by 7, packs them
 * with unsigned saturation and stores 8 bytes.
 * The loop runs until "add $8" produces a carry, so %2 is presumably a
 * negative count with %0/%1 pointing past the end -- confirm at the call site.
 */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN16 /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

/*
 * Old operand list, kept for reference.
 * NOTE(review): it does not match the operand usage of the macros below -- confirm or remove.
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/*
 * Asm fragment: multi-tap vertical filter for packed output formats.
 * %0 is the base address for VROUNDER_OFFSET and both filter lists.
 * Opens the outer loop at label "1:" (closed by a later "jb 1b" in a WRITE*
 * macro) and runs two inner "2:" loops over 16-byte filter entries
 * (source pointer at +0, coefficient at +8, zero pointer terminates):
 *   - chroma list: accumulates U into mm3 and V (source + 4096 bytes) into mm4
 *   - luma list:   accumulates Y1 into mm1 and Y2 into mm7
 * REG_a is the pixel index; REG_d and REG_S are scratch.
 */
#define YSCALEYUV2PACKEDX \
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN16\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN16\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\
    \
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    ASMALIGN16\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm1 \n\t"\
    "paddw %%mm5, %%mm7 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"

/*
 * Asm fragment: YSCALEYUV2PACKEDX followed by YUV -> RGB conversion.
 * Input (from YSCALEYUV2PACKEDX): mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2.
 * Offsets and coefficients are read relative to %0.
 * Output: packed bytes with B in mm2, G in mm4, R in mm5, and mm7 cleared,
 * which is the register layout the WRITEBGR* macros expect.
 */
#define YSCALEYUV2RGBX \
    YSCALEYUV2PACKEDX\
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    /* duplicate each chroma term so it lines up with two luma samples */\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#if 0
/*
 * Disabled code: two-line vertical blend plus YUV -> RGB for full chroma
 * resolution. Operands as used below: %0 = buf0, %1 = buf1, %2 = uvbuf0,
 * %3 = uvbuf1, %6 = yalpha1, %7 = uvalpha1. Results are left as packed bytes
 * with B in mm3, R in mm0 and G in mm1; the loop opened at "1:" is not
 * closed inside this macro.
 */
#define FULL_YSCALEYUV2RGB \
    "pxor %%mm7, %%mm7 \n\t"\
    "movd %6, %%mm6 \n\t" /*yalpha1*/\
    "punpcklwd %%mm6, %%mm6 \n\t"\
    "punpcklwd %%mm6, %%mm6 \n\t"\
    "movd %7, %%mm5 \n\t" /*uvalpha1*/\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN16\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
    "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
    \
    \
    "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
    "psraw $4, %%mm0 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
    "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
    \
    \
    "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
    "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
    "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
    "paddw %%mm1, %%mm3 \n\t" /* B*/\
    "paddw %%mm1, %%mm0 \n\t" /* R*/\
    "packuswb %%mm3, %%mm3 \n\t"\
    \
    "packuswb %%mm0, %%mm0 \n\t"\
    "paddw %%mm4, %%mm2 \n\t"\
    "paddw %%mm2, %%mm1 \n\t" /* G*/\
    \
    "packuswb %%mm1, %%mm1 \n\t"
#endif

/*
 * Asm fragment: blend two source lines for packed YUV output (no RGB step).
 *
 *   index - register used as the pixel index (zeroed here)
 *   c     - register holding the base for the *_MMX_FILTER_OFFSET displacements
 *
 * Operands as used below: %0 = buf0, %1 = buf1, %2 = uvbuf0, %3 = uvbuf1.
 * Side effect: the coefficients stored at CHR/LUM_MMX_FILTER_OFFSET+8 are
 * shifted right by 3 and written back to memory before the loop starts.
 * Per sample the result is (line1 >> 7) + pmulhw(line0 - line1, coeff).
 * Leaves U in mm3, V in mm4, Y1 in mm1, Y2 in mm7; the loop opened at "1:"
 * is closed by the "jb 1b" of the WRITE* macro that follows.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN16\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

/* Indirection so that macro arguments are expanded before being stringified. */
#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

/*
 * Asm fragment: blend two source lines and convert to RGB.
 *
 *   index - register used as the pixel index (zeroed here)
 *   c     - register holding the base for the offset/coefficient displacements
 *
 * Operands as used below: %0 = buf0, %1 = buf1, %2 = uvbuf0, %3 = uvbuf1.
 * Per sample the blend is (line1 >> 4) + pmulhw(line0 - line1, coeff).
 * Output: packed bytes with B in mm2, G in mm4, R in mm5, mm7 cleared.
 * The loop opened at "1:" is closed by the WRITE* macro that follows.
 */
#define REAL_YSCALEYUV2RGB(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN16\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
/* Indirection so that macro arguments are expanded before being stringified. */
#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)

/*
 * Asm fragment: single-source-line variant for packed YUV output.
 * Operands as used below: %0 = buf0, %2 = uvbuf0; `c` is not referenced.
 * Loads U (mm3), V (mm4, 4096 bytes further), Y1 (mm1) and Y2 (mm7) and
 * shifts each right by 7. The loop opened at "1:" is closed by the WRITE*
 * macro that follows.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN16\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"

/* Indirection so that macro arguments are expanded before being stringified. */
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

/*
 * Asm fragment: single-source-line YUV -> RGB conversion (no vertical blend).
 *
 *   index - register used as the pixel index (zeroed here)
 *   c     - register holding the base for the offset/coefficient displacements
 *
 * Operands as used below: %0 = buf0, %2 = uvbuf0. Samples are shifted right
 * by 4 before the offsets are subtracted.
 * Output: packed bytes with B in mm2, G in mm4, R in mm5, mm7 cleared.
 * The loop opened at "1:" is closed by the WRITE* macro that follows.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN16\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
/* Indirection so that macro arguments are expanded before being stringified. */
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

/*
 * Asm fragment: packed YUV variant that takes luma from one line and sums
 * chroma from two lines.
 * Operands as used below: %0 = buf0, %2 = uvbuf0, %3 = uvbuf1; `c` is not
 * referenced. Chroma: (uvbuf0 + uvbuf1) logically shifted right by 8
 * (U in mm3, V in mm4). Luma: buf0 >> 7 (Y1 in mm1, Y2 in mm7).
 * The loop opened at "1:" is closed by the WRITE* macro that follows.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN16\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
/* Indirection so that macro arguments are expanded before being stringified. */
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

/*
 * Asm fragment: YUV -> RGB with vertical chrominance interpolation.
 *
 *   index - register used as the pixel index (zeroed here)
 *   c     - register holding the base for the offset/coefficient displacements
 *
 * Operands as used below: %0 = buf0, %2 = uvbuf0, %3 = uvbuf1.
 * Chroma is (uvbuf0 + uvbuf1) logically shifted right by 5 (see the FIXME:
 * the 16-bit sum may overflow); luma is buf0 >> 4.
 * Output: packed bytes with B in mm2, G in mm4, R in mm5, mm7 cleared.
 * The loop opened at "1:" is closed by the WRITE* macro that follows.
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN16\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
/* Indirection so that macro arguments are expanded before being stringified. */
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

/*
 * Asm fragment: interleave 8 pixels into 4 bytes each and store them.
 *
 *   dst   - destination base register
 *   dstw  - operand the index is compared against
 *   index - pixel index register (advanced by 8)
 *
 * Expects mm2 = B, mm4 = G, mm5 = R, mm7 = 0. Writes 32 bytes at
 * dst + index*4 and jumps back to label "1:" (opened by a preceding
 * YSCALEYUV2* macro) while index < dstw (unsigned compare).
 */
#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
    \
    MOVNTQ(%%mm0, (dst, index, 4))\
    MOVNTQ(%%mm2, 8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
    \
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
/* Indirection so that macro arguments are expanded before being stringified. */
#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)

/*
 * Asm fragment: pack 8 pixels into 16 bits each and store them.
 *
 *   dst   - destination base register
 *   dstw  - operand the index is compared against
 *   index - pixel index register (advanced by 8)
 *
 * Expects mm2 = B, mm4 = G, mm5 = R, mm7 = 0. B and R are masked with bF8
 * and G with bFC (constants defined elsewhere), then merged with shifts.
 * Writes 16 bytes at dst + index*2 and jumps back to label "1:" while
 * index < dstw (unsigned compare).
 */
#define REAL_WRITEBGR16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    \
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    \
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
    \
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
    \
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
    \
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
    \
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
/* Indirection so that macro arguments are expanded before being stringified. */
#define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)

/*
 * Asm fragment: pack 8 pixels into 15 significant bits each and store them.
 *
 *   dst   - destination base register
 *   dstw  - operand the index is compared against
 *   index - pixel index register (advanced by 8)
 *
 * Expects mm2 = B, mm4 = G, mm5 = R, mm7 = 0. All three components are
 * masked with bF8 (constant defined elsewhere); compared with WRITEBGR16,
 * R is additionally shifted right by 1 and G is shifted left by 2, not 3.
 * Writes 16 bytes at dst + index*2 and jumps back to label "1:" while
 * index < dstw (unsigned compare).
 */
#define REAL_WRITEBGR15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
    \
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    \
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
    \
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
    \
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
    \
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
    \
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
/* Indirection so that macro arguments are expanded before being stringified. */
#define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)

/*
 * Asm fragment: older 24-bit writer using mask constants (bm00000111 etc.).
 * Not selected by the WRITEBGR24 definition in this file.
 *
 *   dst   - destination pointer register (advanced by 24 per iteration)
 *   dstw  - operand the index is compared against
 *   index - pixel index register (advanced by 8)
 *
 * Expects mm2 = B, mm4 = G, mm5 = R, mm7 = 0. Builds four 0RGB0RGB quadwords,
 * squeezes out the zero bytes and stores 24 bytes, then jumps back to label
 * "1:" while index < dstw (unsigned compare).
 */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
    \
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    \
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
    \
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
    \
    MOVNTQ(%%mm0, (dst))\
    MOVNTQ(%%mm2, 8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst" \n\t"\
    \
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

/*
 * Asm fragment: 24-bit writer using only plain MMX shifts and unpacks.
 *
 *   dst   - destination pointer register (advanced by 24 per iteration)
 *   dstw  - operand the index is compared against
 *   index - pixel index register (advanced by 8)
 *
 * Expects mm2 = B, mm4 = G, mm5 = R, mm7 = 0 (mm7 is overwritten here).
 * Builds four 0RGB0RGB quadwords, compacts each to 0RGBRGB0 and merges them
 * into three 8-byte stores, then jumps back to label "1:" while
 * index < dstw (unsigned compare).
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
    \
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
    \
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
    \
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
    \
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
    \
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
    \
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
    \
    "add $24, "#dst" \n\t"\
    \
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

/*
 * Asm fragment: 24-bit writer using pshufw and the M24A/M24B/M24C masks
 * (constants defined elsewhere).
 *
 *   dst   - destination pointer register (advanced by 24 per iteration)
 *   dstw  - operand the index is compared against
 *   index - pixel index register (advanced by 8)
 *
 * Expects mm2 = B, mm4 = G, mm5 = R. mm0 and mm7 are loaded with M24A and
 * M24C, so the incoming mm7 is not used. Emits three 8-byte stores and jumps
 * back to label "1:" while index < dstw (unsigned compare).
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(M24A)", %%mm0 \n\t"\
    "movq "MANGLE(M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
    \
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
    \
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
    \
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
    \
    "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
    \
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
    \
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
    \
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
    \
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
    \
    "add $24, "#dst" \n\t"\
    \
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

/* Select the 24-bit writer: the pshufw-based one with MMX2, the plain MMX one otherwise. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif

/*
 * Asm fragment: interleave luma and chroma into 2 bytes per pixel and store.
 *
 *   dst   - destination base register
 *   dstw  - operand the index is compared against
 *   index - pixel index register (advanced by 8)
 *
 * Expects 16-bit U in mm3, V in mm4, Y1 in mm1 and Y2 in mm7 (the register
 * layout left by the YSCALEYUV2PACKED* macros). Values are packed to bytes
 * with unsigned saturation, U/V are interleaved, then merged with Y.
 * Writes 16 bytes at dst + index*2 and jumps back to label "1:" while
 * index < dstw (unsigned compare).
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
    \
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
    \
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
/* Indirection so that macro arguments are expanded before being stringified. */
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)


/**
 * Vertical scaling to planar output (dest = luma row, uDest/vDest = chroma rows).
 * MMX build: runs the YSCALEYUV2YV12X asm once per plane, passing only &c->redDither,
 * the destination row and its width. The two chroma passes are skipped when uDest is NULL.
 * The filter/source arguments are not handed to the asm directly.
 * NOTE(review): presumably the macro reaches the filter data through the
 * *_MMX_FILTER_OFFSET argument relative to &c->redDither, and its first argument
 * (0 for U and luma, 4096 for V) is an offset into the source lines -- confirm in the macro.
 * Non-MMX build: forwards every argument except c to yuv2yuvX_altivec_real() when
 * HAVE_ALTIVEC is defined, otherwise to yuv2yuvXinC().
 */
760 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
|
|
761 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
|
|
762 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
|
|
763 {
|
|
764 #ifdef HAVE_MMX
|
|
765 if(uDest != NULL)
|
|
766 {
|
|
/* U plane */
767 asm volatile(
|
|
768 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
|
|
769 :: "r" (&c->redDither),
|
|
770 "r" (uDest), "p" (chrDstW)
|
|
771 : "%"REG_a, "%"REG_d, "%"REG_S
|
|
772 );
|
|
773
|
|
/* V plane: same filter offset, first macro argument 4096 instead of 0 */
774 asm volatile(
|
|
775 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
|
|
776 :: "r" (&c->redDither),
|
|
777 "r" (vDest), "p" (chrDstW)
|
|
778 : "%"REG_a, "%"REG_d, "%"REG_S
|
|
779 );
|
|
780 }
|
|
781
|
|
/* luma plane, always written */
782 asm volatile(
|
|
783 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
|
|
784 :: "r" (&c->redDither),
|
|
785 "r" (dest), "p" (dstW)
|
|
786 : "%"REG_a, "%"REG_d, "%"REG_S
|
|
787 );
|
|
788 #else
|
|
789 #ifdef HAVE_ALTIVEC
|
|
790 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
|
|
791 chrFilter, chrSrc, chrFilterSize,
|
|
792 dest, uDest, vDest, dstW, chrDstW);
|
|
793 #else //HAVE_ALTIVEC
|
|
794 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
|
|
795 chrFilter, chrSrc, chrFilterSize,
|
|
796 dest, uDest, vDest, dstW, chrDstW);
|
|
797 #endif //!HAVE_ALTIVEC
|
|
798 #endif
|
|
799 }
|
|
800
|
|
/**
 * Vertical scaling wrapper with a single interleaved chroma destination (uDest).
 * There is no SIMD variant here: every argument except c (which is unused) is
 * forwarded unchanged to yuv2nv12XinC().
 */
801 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
|
|
802 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
|
|
803 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
|
|
804 {
|
|
805 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
|
|
806 chrFilter, chrSrc, chrFilterSize,
|
|
807 dest, uDest, dstW, chrDstW, dstFormat);
|
|
808 }
|
|
809
|
|
/**
 * Write one output row per plane with no vertical filtering.
 * C path: every 16-bit sample is shifted right by 7 and clipped to 0..255; V samples are
 * read 2048 entries after the U samples in chrSrc. Chroma is skipped when uDest is NULL.
 * MMX path: YSCALEYUV2YV121 is run once per plane with pointers to the END of the source
 * and destination rows plus the negated width.
 * NOTE(review): presumably the macro counts an index from -width up to 0 -- confirm in
 * its definition.
 */
810 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
|
|
811 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
|
|
812 {
|
|
813 #ifdef HAVE_MMX
|
|
814 if(uDest != NULL)
|
|
815 {
|
|
/* U plane */
816 asm volatile(
|
|
817 YSCALEYUV2YV121
|
|
818 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
|
|
819 "g" (-chrDstW)
|
|
820 : "%"REG_a
|
|
821 );
|
|
822
|
|
/* V plane: source starts 2048 samples after U */
823 asm volatile(
|
|
824 YSCALEYUV2YV121
|
|
825 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
|
|
826 "g" (-chrDstW)
|
|
827 : "%"REG_a
|
|
828 );
|
|
829 }
|
|
830
|
|
/* luma plane */
831 asm volatile(
|
|
832 YSCALEYUV2YV121
|
|
833 :: "r" (lumSrc + dstW), "r" (dest + dstW),
|
|
834 "g" (-dstW)
|
|
835 : "%"REG_a
|
|
836 );
|
|
837 #else
|
|
838 int i;
|
|
839 for(i=0; i<dstW; i++)
|
|
840 {
|
|
841 int val= lumSrc[i]>>7;
|
|
842
|
|
/* bit 8 set means val is outside 0..255, so only then pay for the clamp */
843 if(val&256){
|
|
844 if(val<0) val=0;
|
|
845 else val=255;
|
|
846 }
|
|
847
|
|
848 dest[i]= val;
|
|
849 }
|
|
850
|
|
851 if(uDest != NULL)
|
|
852 for(i=0; i<chrDstW; i++)
|
|
853 {
|
|
854 int u=chrSrc[i]>>7;
|
|
855 int v=chrSrc[i + 2048]>>7;
|
|
856
|
|
/* one combined range test for both samples, then clamp each separately */
857 if((u|v)&256){
|
|
858 if(u<0) u=0;
|
|
859 else if (u>255) u=255;
|
|
860 if(v<0) v=0;
|
|
861 else if (v>255) v=255;
|
|
862 }
|
|
863
|
|
864 uDest[i]= u;
|
|
865 vDest[i]= v;
|
|
866 }
|
|
867 #endif
|
|
868 }
|
|
869
|
|
870
|
|
871 /**
|
|
872 * vertical scale YV12 to RGB
|
|
873 */
|
|
/*
 * Dispatches on c->dstFormat.
 * MMX build: BGR32, BGR24, BGR15, BGR16 and YUY2 are produced by inline asm
 * (YSCALEYUV2RGBX or YSCALEYUV2PACKEDX followed by the matching WRITE* macro).
 * Every asm block uses the same operand layout: %0 = &c->redDither, %1..%3 = the
 * local "dummy", %4 = dest, %5 = dstW (memory operand). The lum/chr filter arguments are
 * not passed to the asm.
 * NOTE(review): the dummy operands apparently only exist to keep dest/dstW at %4/%5, and
 * the macros presumably find the filter data relative to %0 -- confirm in the macro bodies.
 * All other formats (and every format in non-MMX builds): altivec_yuv2packedX() for the
 * formats listed in the HAVE_ALTIVEC branch, otherwise yuv2packedXinC().
 */
874 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
|
|
875 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
|
|
876 uint8_t *dest, long dstW, long dstY)
|
|
877 {
|
|
878 long dummy=0;
|
|
879 switch(c->dstFormat)
|
|
880 {
|
|
881 #ifdef HAVE_MMX
|
|
882 case IMGFMT_BGR32:
|
|
883 {
|
|
884 asm volatile(
|
|
885 YSCALEYUV2RGBX
|
|
886 WRITEBGR32(%4, %5, %%REGa)
|
|
887
|
|
888 :: "r" (&c->redDither),
|
|
889 "m" (dummy), "m" (dummy), "m" (dummy),
|
|
890 "r" (dest), "m" (dstW)
|
|
891 : "%"REG_a, "%"REG_d, "%"REG_S
|
|
892 );
|
|
893 }
|
|
894 break;
|
|
895 case IMGFMT_BGR24:
|
|
896 {
|
|
/* REG_b = dest + 3*REG_a: byte address of the current pixel for 3-byte output */
897 asm volatile(
|
|
898 YSCALEYUV2RGBX
|
|
899 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
|
|
900 "add %4, %%"REG_b" \n\t"
|
|
901 WRITEBGR24(%%REGb, %5, %%REGa)
|
|
902
|
|
903 :: "r" (&c->redDither),
|
|
904 "m" (dummy), "m" (dummy), "m" (dummy),
|
|
905 "r" (dest), "m" (dstW)
|
|
906 : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
|
|
907 );
|
|
908 }
|
|
909 break;
|
|
910 case IMGFMT_BGR15:
|
|
911 {
|
|
912 asm volatile(
|
|
913 YSCALEYUV2RGBX
|
|
914 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
|
|
915 #ifdef DITHER1XBPP
|
|
916 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
|
|
917 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
|
|
918 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
|
|
919 #endif
|
|
920
|
|
921 WRITEBGR15(%4, %5, %%REGa)
|
|
922
|
|
923 :: "r" (&c->redDither),
|
|
924 "m" (dummy), "m" (dummy), "m" (dummy),
|
|
925 "r" (dest), "m" (dstW)
|
|
926 : "%"REG_a, "%"REG_d, "%"REG_S
|
|
927 );
|
|
928 }
|
|
929 break;
|
|
930 case IMGFMT_BGR16:
|
|
931 {
|
|
932 asm volatile(
|
|
933 YSCALEYUV2RGBX
|
|
934 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
|
|
935 #ifdef DITHER1XBPP
|
|
936 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
|
|
937 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
|
|
938 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
|
|
939 #endif
|
|
940
|
|
941 WRITEBGR16(%4, %5, %%REGa)
|
|
942
|
|
943 :: "r" (&c->redDither),
|
|
944 "m" (dummy), "m" (dummy), "m" (dummy),
|
|
945 "r" (dest), "m" (dstW)
|
|
946 : "%"REG_a, "%"REG_d, "%"REG_S
|
|
947 );
|
|
948 }
|
|
949 break;
|
|
950 case IMGFMT_YUY2:
|
|
951 {
|
|
952 asm volatile(
|
|
953 YSCALEYUV2PACKEDX
|
|
954 /* NOTE(review): this path works on mm1/mm7 and mm3/mm4 (shifted below, consumed by WRITEYUY2), not on the B/G/R registers of the RGB cases */
|
|
955
|
|
956 "psraw $3, %%mm3 \n\t"
|
|
957 "psraw $3, %%mm4 \n\t"
|
|
958 "psraw $3, %%mm1 \n\t"
|
|
959 "psraw $3, %%mm7 \n\t"
|
|
960 WRITEYUY2(%4, %5, %%REGa)
|
|
961
|
|
962 :: "r" (&c->redDither),
|
|
963 "m" (dummy), "m" (dummy), "m" (dummy),
|
|
964 "r" (dest), "m" (dstW)
|
|
965 : "%"REG_a, "%"REG_d, "%"REG_S
|
|
966 );
|
|
967 }
|
|
968 break;
|
|
969 #endif
|
|
970 default:
|
|
971 #ifdef HAVE_ALTIVEC
|
|
972 /* The following list of supported dstFormat values should
|
|
973 match what's found in the body of altivec_yuv2packedX() */
|
|
974 if(c->dstFormat==IMGFMT_ABGR || c->dstFormat==IMGFMT_BGRA ||
|
|
975 c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 ||
|
|
976 c->dstFormat==IMGFMT_RGBA || c->dstFormat==IMGFMT_ARGB)
|
|
977 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
|
|
978 chrFilter, chrSrc, chrFilterSize,
|
|
979 dest, dstW, dstY);
|
|
980 else
|
|
981 #endif
|
|
982 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
|
|
983 chrFilter, chrSrc, chrFilterSize,
|
|
984 dest, dstW, dstY);
|
|
985 break;
|
|
986 }
|
|
987 }
|
|
988
|
|
989 /**
|
|
990 * vertical bilinear scale YV12 to RGB
|
|
991 */
|
|
992 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
|
|
993 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
|
|
994 {
|
|
995 int yalpha1=yalpha^4095;
|
|
996 int uvalpha1=uvalpha^4095;
|
|
997 int i;
|
|
998
|
|
999 #if 0 //isn't used
|
|
1000 if(flags&SWS_FULL_CHR_H_INT)
|
|
1001 {
|
|
1002 switch(dstFormat)
|
|
1003 {
|
|
1004 #ifdef HAVE_MMX
|
|
1005 case IMGFMT_BGR32:
|
|
1006 asm volatile(
|
|
1007
|
|
1008
|
|
1009 FULL_YSCALEYUV2RGB
|
|
1010 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
|
|
1011 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
|
|
1012
|
|
1013 "movq %%mm3, %%mm1 \n\t"
|
|
1014 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
|
|
1015 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
|
|
1016
|
|
1017 MOVNTQ(%%mm3, (%4, %%REGa, 4))
|
|
1018 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
|
|
1019
|
|
1020 "add $4, %%"REG_a" \n\t"
|
|
1021 "cmp %5, %%"REG_a" \n\t"
|
|
1022 " jb 1b \n\t"
|
|
1023
|
|
1024
|
|
1025 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
|
|
1026 "m" (yalpha1), "m" (uvalpha1)
|
|
1027 : "%"REG_a
|
|
1028 );
|
|
1029 break;
|
|
1030 case IMGFMT_BGR24:
|
|
1031 asm volatile(
|
|
1032
|
|
1033 FULL_YSCALEYUV2RGB
|
|
1034
|
|
1035 // lsb ... msb
|
|
1036 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
|
|
1037 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
|
|
1038
|
|
1039 "movq %%mm3, %%mm1 \n\t"
|
|
1040 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
|
|
1041 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
|
|
1042
|
|
1043 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
|
|
1044 "psrlq $8, %%mm3 \n\t" // GR0BGR00
|
|
1045 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
|
|
1046 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
|
|
1047 "por %%mm2, %%mm3 \n\t" // BGRBGR00
|
|
1048 "movq %%mm1, %%mm2 \n\t"
|
|
1049 "psllq $48, %%mm1 \n\t" // 000000BG
|
|
1050 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
|
|
1051
|
|
1052 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
|
|
1053 "psrld $16, %%mm2 \n\t" // R000R000
|
|
1054 "psrlq $24, %%mm1 \n\t" // 0BGR0000
|
|
1055 "por %%mm2, %%mm1 \n\t" // RBGRR000
|
|
1056
|
|
1057 "mov %4, %%"REG_b" \n\t"
|
|
1058 "add %%"REG_a", %%"REG_b" \n\t"
|
|
1059
|
|
1060 #ifdef HAVE_MMX2
|
|
1061 //FIXME Alignment
|
|
1062 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
|
|
1063 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
|
|
1064 #else
|
|
1065 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
|
|
1066 "psrlq $32, %%mm3 \n\t"
|
|
1067 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
|
|
1068 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
|
|
1069 #endif
|
|
1070 "add $4, %%"REG_a" \n\t"
|
|
1071 "cmp %5, %%"REG_a" \n\t"
|
|
1072 " jb 1b \n\t"
|
|
1073
|
|
1074 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
|
|
1075 "m" (yalpha1), "m" (uvalpha1)
|
|
1076 : "%"REG_a, "%"REG_b
|
|
1077 );
|
|
1078 break;
|
|
1079 case IMGFMT_BGR15:
|
|
1080 asm volatile(
|
|
1081
|
|
1082 FULL_YSCALEYUV2RGB
|
|
1083 #ifdef DITHER1XBPP
|
|
1084 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
|
|
1085 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
|
|
1086 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
|
|
1087 #endif
|
|
1088 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
|
|
1089 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
|
|
1090 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
|
|
1091
|
|
1092 "psrlw $3, %%mm3 \n\t"
|
|
1093 "psllw $2, %%mm1 \n\t"
|
|
1094 "psllw $7, %%mm0 \n\t"
|
|
1095 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
|
|
1096 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
|
|
1097
|
|
1098 "por %%mm3, %%mm1 \n\t"
|
|
1099 "por %%mm1, %%mm0 \n\t"
|
|
1100
|
|
1101 MOVNTQ(%%mm0, (%4, %%REGa, 2))
|
|
1102
|
|
1103 "add $4, %%"REG_a" \n\t"
|
|
1104 "cmp %5, %%"REG_a" \n\t"
|
|
1105 " jb 1b \n\t"
|
|
1106
|
|
1107 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
|
|
1108 "m" (yalpha1), "m" (uvalpha1)
|
|
1109 : "%"REG_a
|
|
1110 );
|
|
1111 break;
|
|
1112 case IMGFMT_BGR16:
|
|
1113 asm volatile(
|
|
1114
|
|
1115 FULL_YSCALEYUV2RGB
|
|
1116 #ifdef DITHER1XBPP
|
|
1117 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
|
|
1118 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
|
|
1119 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
|
|
1120 #endif
|
|
1121 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
|
|
1122 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
|
|
1123 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
|
|
1124
|
|
1125 "psrlw $3, %%mm3 \n\t"
|
|
1126 "psllw $3, %%mm1 \n\t"
|
|
1127 "psllw $8, %%mm0 \n\t"
|
|
1128 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
|
|
1129 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
|
|
1130
|
|
1131 "por %%mm3, %%mm1 \n\t"
|
|
1132 "por %%mm1, %%mm0 \n\t"
|
|
1133
|
|
1134 MOVNTQ(%%mm0, (%4, %%REGa, 2))
|
|
1135
|
|
1136 "add $4, %%"REG_a" \n\t"
|
|
1137 "cmp %5, %%"REG_a" \n\t"
|
|
1138 " jb 1b \n\t"
|
|
1139
|
|
1140 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
|
|
1141 "m" (yalpha1), "m" (uvalpha1)
|
|
1142 : "%"REG_a
|
|
1143 );
|
|
1144 break;
|
|
1145 #endif
|
|
1146 case IMGFMT_RGB32:
|
|
1147 #ifndef HAVE_MMX
|
|
1148 case IMGFMT_BGR32:
|
|
1149 #endif
|
|
1150 if(dstFormat==IMGFMT_BGR32)
|
|
1151 {
|
|
1152 int i;
|
|
1153 #ifdef WORDS_BIGENDIAN
|
|
1154 dest++;
|
|
1155 #endif
|
|
1156 for(i=0;i<dstW;i++){
|
|
1157 // vertical linear interpolation && yuv2rgb in a single step:
|
|
1158 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
|
|
1159 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
|
|
1160 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
|
|
1161 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
|
|
1162 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
|
|
1163 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
|
|
1164 dest+= 4;
|
|
1165 }
|
|
1166 }
|
|
1167 else if(dstFormat==IMGFMT_BGR24)
|
|
1168 {
|
|
1169 int i;
|
|
1170 for(i=0;i<dstW;i++){
|
|
1171 // vertical linear interpolation && yuv2rgb in a single step:
|
|
1172 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
|
|
1173 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
|
|
1174 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
|
|
1175 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
|
|
1176 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
|
|
1177 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
|
|
1178 dest+= 3;
|
|
1179 }
|
|
1180 }
|
|
1181 else if(dstFormat==IMGFMT_BGR16)
|
|
1182 {
|
|
1183 int i;
|
|
1184 for(i=0;i<dstW;i++){
|
|
1185 // vertical linear interpolation && yuv2rgb in a single step:
|
|
1186 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
|
|
1187 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
|
|
1188 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
|
|
1189
|
|
1190 ((uint16_t*)dest)[i] =
|
|
1191 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
|
|
1192 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
|
|
1193 clip_table16r[(Y + yuvtab_3343[V]) >>13];
|
|
1194 }
|
|
1195 }
|
|
1196 else if(dstFormat==IMGFMT_BGR15)
|
|
1197 {
|
|
1198 int i;
|
|
1199 for(i=0;i<dstW;i++){
|
|
1200 // vertical linear interpolation && yuv2rgb in a single step:
|
|
1201 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
|
|
1202 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
|
|
1203 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
|
|
1204
|
|
1205 ((uint16_t*)dest)[i] =
|
|
1206 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
|
|
1207 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
|
|
1208 clip_table15r[(Y + yuvtab_3343[V]) >>13];
|
|
1209 }
|
|
1210 }
|
|
1211 }//FULL_UV_IPOL
|
|
1212 else
|
|
1213 {
|
|
1214 #endif // if 0
|
|
1215 #ifdef HAVE_MMX
|
|
1216 switch(c->dstFormat)
|
|
1217 {
|
|
1218 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
|
|
1219 case IMGFMT_BGR32:
|
|
1220 asm volatile(
|
|
1221 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1222 "mov %4, %%"REG_b" \n\t"
|
|
1223 "push %%"REG_BP" \n\t"
|
|
1224 YSCALEYUV2RGB(%%REGBP, %5)
|
|
1225 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
|
|
1226 "pop %%"REG_BP" \n\t"
|
|
1227 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1228
|
|
1229 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1230 "a" (&c->redDither)
|
|
1231 );
|
|
1232 return;
|
|
1233 case IMGFMT_BGR24:
|
|
1234 asm volatile(
|
|
1235 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1236 "mov %4, %%"REG_b" \n\t"
|
|
1237 "push %%"REG_BP" \n\t"
|
|
1238 YSCALEYUV2RGB(%%REGBP, %5)
|
|
1239 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
|
|
1240 "pop %%"REG_BP" \n\t"
|
|
1241 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1242 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1243 "a" (&c->redDither)
|
|
1244 );
|
|
1245 return;
|
|
1246 case IMGFMT_BGR15:
|
|
1247 asm volatile(
|
|
1248 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1249 "mov %4, %%"REG_b" \n\t"
|
|
1250 "push %%"REG_BP" \n\t"
|
|
1251 YSCALEYUV2RGB(%%REGBP, %5)
|
|
1252 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
|
|
1253 #ifdef DITHER1XBPP
|
|
1254 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
|
|
1255 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
|
|
1256 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
|
|
1257 #endif
|
|
1258
|
|
1259 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
|
|
1260 "pop %%"REG_BP" \n\t"
|
|
1261 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1262
|
|
1263 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1264 "a" (&c->redDither)
|
|
1265 );
|
|
1266 return;
|
|
1267 case IMGFMT_BGR16:
|
|
1268 asm volatile(
|
|
1269 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1270 "mov %4, %%"REG_b" \n\t"
|
|
1271 "push %%"REG_BP" \n\t"
|
|
1272 YSCALEYUV2RGB(%%REGBP, %5)
|
|
1273 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
|
|
1274 #ifdef DITHER1XBPP
|
|
1275 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
|
|
1276 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
|
|
1277 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
|
|
1278 #endif
|
|
1279
|
|
1280 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
|
|
1281 "pop %%"REG_BP" \n\t"
|
|
1282 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1283 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1284 "a" (&c->redDither)
|
|
1285 );
|
|
1286 return;
|
|
1287 case IMGFMT_YUY2:
|
|
1288 asm volatile(
|
|
1289 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1290 "mov %4, %%"REG_b" \n\t"
|
|
1291 "push %%"REG_BP" \n\t"
|
|
1292 YSCALEYUV2PACKED(%%REGBP, %5)
|
|
1293 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
|
|
1294 "pop %%"REG_BP" \n\t"
|
|
1295 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1296 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1297 "a" (&c->redDither)
|
|
1298 );
|
|
1299 return;
|
|
1300 default: break;
|
|
1301 }
|
|
1302 #endif //HAVE_MMX
|
|
1303 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
|
|
1304 }
|
|
1305
|
|
1306 /**
|
|
1307 * YV12 to RGB without scaling or interpolating
|
|
1308 */
|
|
/*
 * Packed output from a single luma line (buf0) and two chroma lines.
 * - If flags has SWS_FULL_CHR_H_INT set, the work is delegated to yuv2packed2 with
 *   buf0 passed for both luma lines and a luma weight of 0.
 * - MMX build: BGR32/BGR24/BGR15/BGR16/YUY2 are written by inline asm. uvalpha < 2048
 *   selects the YSCALEYUV2RGB1 / YSCALEYUV2PACKED1 macros, otherwise the "1b" variants.
 *   Every asm block saves REG_b at ESP_OFFSET(%5) (%5 = &c->redDither), loads dest into
 *   REG_b, pushes REG_BP for use as the pixel index, and undoes both at the end.
 *   The width is read from 8280(%5) (8280 == DSTW_OFFSET), not from an operand.
 * - Any other format, and non-MMX builds, use the YSCALE_YUV_2_ANYRGB_C fallback,
 *   again split on uvalpha < 2048.
 * The locals yalpha1, yalpha, buf1 and i are presumably consumed by the fallback
 * macros (buf1 is also passed to the asm as operand %1).
 */
1309 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
|
|
1310 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
|
|
1311 {
|
|
1312 const int yalpha1=0;
|
|
1313 int i;
|
|
1314
|
|
1315 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
|
|
1316 const int yalpha= 4096; //FIXME ...
|
|
1317
|
|
1318 if(flags&SWS_FULL_CHR_H_INT)
|
|
1319 {
|
|
1320 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
|
|
1321 return;
|
|
1322 }
|
|
1323
|
|
1324 #ifdef HAVE_MMX
|
|
1325 if( uvalpha < 2048 ) // note: this is not correct (it shifts chrominance by 0.5 pixels) but it is a bit faster
|
|
1326 {
|
|
1327 switch(dstFormat)
|
|
1328 {
|
|
1329 case IMGFMT_BGR32:
|
|
1330 asm volatile(
|
|
1331 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1332 "mov %4, %%"REG_b" \n\t"
|
|
1333 "push %%"REG_BP" \n\t"
|
|
1334 YSCALEYUV2RGB1(%%REGBP, %5)
|
|
1335 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
|
|
1336 "pop %%"REG_BP" \n\t"
|
|
1337 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1338
|
|
1339 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1340 "a" (&c->redDither)
|
|
1341 );
|
|
1342 return;
|
|
1343 case IMGFMT_BGR24:
|
|
1344 asm volatile(
|
|
1345 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1346 "mov %4, %%"REG_b" \n\t"
|
|
1347 "push %%"REG_BP" \n\t"
|
|
1348 YSCALEYUV2RGB1(%%REGBP, %5)
|
|
1349 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
|
|
1350 "pop %%"REG_BP" \n\t"
|
|
1351 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1352
|
|
1353 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1354 "a" (&c->redDither)
|
|
1355 );
|
|
1356 return;
|
|
1357 case IMGFMT_BGR15:
|
|
1358 asm volatile(
|
|
1359 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1360 "mov %4, %%"REG_b" \n\t"
|
|
1361 "push %%"REG_BP" \n\t"
|
|
1362 YSCALEYUV2RGB1(%%REGBP, %5)
|
|
1363 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
|
|
1364 #ifdef DITHER1XBPP
|
|
1365 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
|
|
1366 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
|
|
1367 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
|
|
1368 #endif
|
|
1369 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
|
|
1370 "pop %%"REG_BP" \n\t"
|
|
1371 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1372
|
|
1373 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1374 "a" (&c->redDither)
|
|
1375 );
|
|
1376 return;
|
|
1377 case IMGFMT_BGR16:
|
|
1378 asm volatile(
|
|
1379 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1380 "mov %4, %%"REG_b" \n\t"
|
|
1381 "push %%"REG_BP" \n\t"
|
|
1382 YSCALEYUV2RGB1(%%REGBP, %5)
|
|
1383 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
|
|
1384 #ifdef DITHER1XBPP
|
|
1385 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
|
|
1386 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
|
|
1387 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
|
|
1388 #endif
|
|
1389
|
|
1390 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
|
|
1391 "pop %%"REG_BP" \n\t"
|
|
1392 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1393
|
|
1394 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1395 "a" (&c->redDither)
|
|
1396 );
|
|
1397 return;
|
|
1398 case IMGFMT_YUY2:
|
|
1399 asm volatile(
|
|
1400 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1401 "mov %4, %%"REG_b" \n\t"
|
|
1402 "push %%"REG_BP" \n\t"
|
|
1403 YSCALEYUV2PACKED1(%%REGBP, %5)
|
|
1404 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
|
|
1405 "pop %%"REG_BP" \n\t"
|
|
1406 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1407
|
|
1408 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1409 "a" (&c->redDither)
|
|
1410 );
|
|
1411 return;
|
|
1412 }
|
|
1413 }
|
|
1414 else
|
|
1415 {
|
|
/* uvalpha >= 2048: same five formats, using the "1b" scaling macros */
1416 switch(dstFormat)
|
|
1417 {
|
|
1418 case IMGFMT_BGR32:
|
|
1419 asm volatile(
|
|
1420 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1421 "mov %4, %%"REG_b" \n\t"
|
|
1422 "push %%"REG_BP" \n\t"
|
|
1423 YSCALEYUV2RGB1b(%%REGBP, %5)
|
|
1424 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
|
|
1425 "pop %%"REG_BP" \n\t"
|
|
1426 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1427
|
|
1428 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1429 "a" (&c->redDither)
|
|
1430 );
|
|
1431 return;
|
|
1432 case IMGFMT_BGR24:
|
|
1433 asm volatile(
|
|
1434 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1435 "mov %4, %%"REG_b" \n\t"
|
|
1436 "push %%"REG_BP" \n\t"
|
|
1437 YSCALEYUV2RGB1b(%%REGBP, %5)
|
|
1438 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
|
|
1439 "pop %%"REG_BP" \n\t"
|
|
1440 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1441
|
|
1442 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1443 "a" (&c->redDither)
|
|
1444 );
|
|
1445 return;
|
|
1446 case IMGFMT_BGR15:
|
|
1447 asm volatile(
|
|
1448 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1449 "mov %4, %%"REG_b" \n\t"
|
|
1450 "push %%"REG_BP" \n\t"
|
|
1451 YSCALEYUV2RGB1b(%%REGBP, %5)
|
|
1452 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
|
|
1453 #ifdef DITHER1XBPP
|
|
1454 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
|
|
1455 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
|
|
1456 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
|
|
1457 #endif
|
|
1458 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
|
|
1459 "pop %%"REG_BP" \n\t"
|
|
1460 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1461
|
|
1462 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1463 "a" (&c->redDither)
|
|
1464 );
|
|
1465 return;
|
|
1466 case IMGFMT_BGR16:
|
|
1467 asm volatile(
|
|
1468 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1469 "mov %4, %%"REG_b" \n\t"
|
|
1470 "push %%"REG_BP" \n\t"
|
|
1471 YSCALEYUV2RGB1b(%%REGBP, %5)
|
|
1472 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
|
|
1473 #ifdef DITHER1XBPP
|
|
1474 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
|
|
1475 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
|
|
1476 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
|
|
1477 #endif
|
|
1478
|
|
1479 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
|
|
1480 "pop %%"REG_BP" \n\t"
|
|
1481 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1482
|
|
1483 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1484 "a" (&c->redDither)
|
|
1485 );
|
|
1486 return;
|
|
1487 case IMGFMT_YUY2:
|
|
1488 asm volatile(
|
|
1489 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
|
|
1490 "mov %4, %%"REG_b" \n\t"
|
|
1491 "push %%"REG_BP" \n\t"
|
|
1492 YSCALEYUV2PACKED1b(%%REGBP, %5)
|
|
1493 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
|
|
1494 "pop %%"REG_BP" \n\t"
|
|
1495 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
|
|
1496
|
|
1497 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
|
|
1498 "a" (&c->redDither)
|
|
1499 );
|
|
1500 return;
|
|
1501 }
|
|
1502 }
|
|
1503 #endif
|
|
/* C fallback: reached for formats the MMX switches do not handle, and in non-MMX builds */
1504 if( uvalpha < 2048 )
|
|
1505 {
|
|
1506 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
|
|
1507 }else{
|
|
1508 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
|
|
1509 }
|
|
1510 }
|
|
1511
|
|
1512 //FIXME yuy2* can read up to 7 samples too many
|
|
1513
|
|
/**
 * Extract the even bytes of a packed row: dst[i] = src[2*i] for i in [0, width).
 * MMX path: masks each 16-bit pair with bm01010101, packs two quadwords to 8 output
 * bytes per iteration, and indexes from -width upwards relative to the row ends
 * (src + 2*width, dst + width). The loop only stops once the index is no longer
 * negative, so when width is not a multiple of 8 it reads and writes past the row end.
 * NOTE(review): bm01010101 is presumably the 0x00FF per-word mask -- confirm its definition.
 */
1514 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
|
|
1515 {
|
|
1516 #ifdef HAVE_MMX
|
|
1517 asm volatile(
|
|
1518 "movq "MANGLE(bm01010101)", %%mm2\n\t"
|
|
1519 "mov %0, %%"REG_a" \n\t"
|
|
1520 "1: \n\t"
|
|
1521 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
|
|
1522 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
|
|
1523 "pand %%mm2, %%mm0 \n\t"
|
|
1524 "pand %%mm2, %%mm1 \n\t"
|
|
1525 "packuswb %%mm1, %%mm0 \n\t"
|
|
1526 "movq %%mm0, (%2, %%"REG_a") \n\t"
|
|
1527 "add $8, %%"REG_a" \n\t"
|
|
1528 " js 1b \n\t"
|
|
1529 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
|
|
1530 : "%"REG_a
|
|
1531 );
|
|
1532 #else
|
|
1533 int i;
|
|
1534 for(i=0; i<width; i++)
|
|
1535 dst[i]= src[2*i];
|
|
1536 #endif
|
|
1537 }
|
|
1538
|
|
/**
 * Produce one row of U and V from two packed rows by averaging them:
 * C path: dstU[i] = (src1[4*i+1] + src2[4*i+1]) >> 1, dstV[i] likewise from byte 4*i+3.
 * The asm path is compiled only when HAVE_MMX2 or HAVE_3DNOW is defined (it needs PAVGB).
 * It averages the rows bytewise, keeps the odd bytes (psrlw $8), then splits them into
 * the low-byte stream (stored through %3 = dstU) and high-byte stream (stored through
 * %4 = dstV), 4 samples of each per iteration, indexed from -width off the row ends.
 * NOTE(review): pavgb/pavgusb round the average up ((a+b+1)>>1) while the C path
 * truncates, so the two paths can differ by 1 -- confirm whether that is intended.
 */
1539 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
|
|
1540 {
|
|
1541 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
|
1542 asm volatile(
|
|
1543 "movq "MANGLE(bm01010101)", %%mm4\n\t"
|
|
1544 "mov %0, %%"REG_a" \n\t"
|
|
1545 "1: \n\t"
|
|
1546 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
|
|
1547 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
|
|
1548 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
|
|
1549 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
|
|
1550 PAVGB(%%mm2, %%mm0)
|
|
1551 PAVGB(%%mm3, %%mm1)
|
|
1552 "psrlw $8, %%mm0 \n\t"
|
|
1553 "psrlw $8, %%mm1 \n\t"
|
|
1554 "packuswb %%mm1, %%mm0 \n\t"
|
|
1555 "movq %%mm0, %%mm1 \n\t"
|
|
1556 "psrlw $8, %%mm0 \n\t"
|
|
1557 "pand %%mm4, %%mm1 \n\t"
|
|
1558 "packuswb %%mm0, %%mm0 \n\t"
|
|
1559 "packuswb %%mm1, %%mm1 \n\t"
|
|
1560 "movd %%mm0, (%4, %%"REG_a") \n\t"
|
|
1561 "movd %%mm1, (%3, %%"REG_a") \n\t"
|
|
1562 "add $4, %%"REG_a" \n\t"
|
|
1563 " js 1b \n\t"
|
|
1564 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
|
|
1565 : "%"REG_a
|
|
1566 );
|
|
1567 #else
|
|
1568 int i;
|
|
1569 for(i=0; i<width; i++)
|
|
1570 {
|
|
1571 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
|
|
1572 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
|
|
1573 }
|
|
1574 #endif
|
|
1575 }
|
|
1576
|
|
1577 //this is almost identical to the previous, and exists only because yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses
|
|
/**
 * Extract the odd bytes of a packed row: dst[i] = src[2*i+1] for i in [0, width).
 * MMX path: shifts each 16-bit pair right by 8 to keep its high byte, packs two
 * quadwords to 8 output bytes per iteration, and indexes from -width upwards relative
 * to the row ends. The loop only stops once the index is no longer negative, so when
 * width is not a multiple of 8 it reads and writes past the row end.
 */
1578 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
|
|
1579 {
|
|
1580 #ifdef HAVE_MMX
|
|
1581 asm volatile(
|
|
1582 "mov %0, %%"REG_a" \n\t"
|
|
1583 "1: \n\t"
|
|
1584 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
|
|
1585 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
|
|
1586 "psrlw $8, %%mm0 \n\t"
|
|
1587 "psrlw $8, %%mm1 \n\t"
|
|
1588 "packuswb %%mm1, %%mm0 \n\t"
|
|
1589 "movq %%mm0, (%2, %%"REG_a") \n\t"
|
|
1590 "add $8, %%"REG_a" \n\t"
|
|
1591 " js 1b \n\t"
|
|
1592 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
|
|
1593 : "%"REG_a
|
|
1594 );
|
|
1595 #else
|
|
1596 int i;
|
|
1597 for(i=0; i<width; i++)
|
|
1598 dst[i]= src[2*i+1];
|
|
1599 #endif
|
|
1600 }
|
|
1601
|
|
/**
 * Produce one row of U and V from two packed rows by averaging them:
 * C path: dstU[i] = (src1[4*i+0] + src2[4*i+0]) >> 1, dstV[i] likewise from byte 4*i+2.
 * The asm path is compiled only when HAVE_MMX2 or HAVE_3DNOW is defined (it needs PAVGB).
 * It averages the rows bytewise, keeps the even bytes (pand with bm01010101), then
 * splits them into the low-byte stream (stored through %3 = dstU) and high-byte stream
 * (stored through %4 = dstV), 4 samples of each per iteration, indexed from -width
 * off the row ends.
 * NOTE(review): pavgb/pavgusb round the average up ((a+b+1)>>1) while the C path
 * truncates, so the two paths can differ by 1 -- confirm whether that is intended.
 */
1602 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
|
|
1603 {
|
|
1604 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
|
1605 asm volatile(
|
|
1606 "movq "MANGLE(bm01010101)", %%mm4\n\t"
|
|
1607 "mov %0, %%"REG_a" \n\t"
|
|
1608 "1: \n\t"
|
|
1609 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
|
|
1610 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
|
|
1611 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
|
|
1612 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
|
|
1613 PAVGB(%%mm2, %%mm0)
|
|
1614 PAVGB(%%mm3, %%mm1)
|
|
1615 "pand %%mm4, %%mm0 \n\t"
|
|
1616 "pand %%mm4, %%mm1 \n\t"
|
|
1617 "packuswb %%mm1, %%mm0 \n\t"
|
|
1618 "movq %%mm0, %%mm1 \n\t"
|
|
1619 "psrlw $8, %%mm0 \n\t"
|
|
1620 "pand %%mm4, %%mm1 \n\t"
|
|
1621 "packuswb %%mm0, %%mm0 \n\t"
|
|
1622 "packuswb %%mm1, %%mm1 \n\t"
|
|
1623 "movd %%mm0, (%4, %%"REG_a") \n\t"
|
|
1624 "movd %%mm1, (%3, %%"REG_a") \n\t"
|
|
1625 "add $4, %%"REG_a" \n\t"
|
|
1626 " js 1b \n\t"
|
|
1627 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
|
|
1628 : "%"REG_a
|
|
1629 );
|
|
1630 #else
|
|
1631 int i;
|
|
1632 for(i=0; i<width; i++)
|
|
1633 {
|
|
1634 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
|
|
1635 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
|
|
1636 }
|
|
1637 #endif
|
|
1638 }
|
|
1639
|
|
1640 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
|
|
1641 {
|
|
1642 int i;
|
|
1643 for(i=0; i<width; i++)
|
|
1644 {
|
|
1645 int b= ((uint32_t*)src)[i]&0xFF;
|
|
1646 int g= (((uint32_t*)src)[i]>>8)&0xFF;
|
|
1647 int r= (((uint32_t*)src)[i]>>16)&0xFF;
|
|
1648
|
|
1649 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
|
|
1650 }
|
|
1651 }
|
|
1652
|
|
1653 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
|
|
1654 {
|
|
1655 int i;
|
|
1656 for(i=0; i<width; i++)
|
|
1657 {
|
|
1658 const int a= ((uint32_t*)src1)[2*i+0];
|
|
1659 const int e= ((uint32_t*)src1)[2*i+1];
|
|
1660 const int c= ((uint32_t*)src2)[2*i+0];
|
|
1661 const int d= ((uint32_t*)src2)[2*i+1];
|
|
1662 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
|
|
1663 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
|
|
1664 const int b= l&0x3FF;
|
|
1665 const int g= h>>8;
|
|
1666 const int r= l>>16;
|
|
1667
|
|
1668 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
|
|
1669 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
|
|
1670 }
|
|
1671 }
|
|
1672
|
|
/**
 * Convert a row of 3-byte pixels to 8-bit luma.
 * C path: bytes 0/1/2 of each pixel are taken as b/g/r and
 * dst[i] = (RY*r + GY*g + BY*b + (33 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT.
 * MMX path: REG_a is the output index running from -width towards 0, REG_b = 3*REG_a is
 * the matching byte offset into src; both are applied to pointers to the row ends.
 * Each iteration handles 8 pixels in two groups of 4: every pixel is loaded with a
 * 4-byte movd (so one byte beyond the pixel is fetched), widened to words, multiplied
 * with bgr2YCoeff (pmaddwd), reduced, and finally bgr2YOffset is added with saturation.
 * The loop ends once REG_a is no longer negative, so a width that is not a multiple
 * of 8 is processed past the row end.
 * NOTE(review): the fixed-point scaling of bgr2YCoeff versus the C constants cannot be
 * checked from here -- confirm the two paths agree.
 */
1673 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
|
|
1674 {
|
|
1675 #ifdef HAVE_MMX
|
|
1676 asm volatile(
|
|
1677 "mov %2, %%"REG_a" \n\t"
|
|
1678 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
|
|
1679 "movq "MANGLE(w1111)", %%mm5 \n\t"
|
|
1680 "pxor %%mm7, %%mm7 \n\t"
|
|
1681 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
|
|
1682 ASMALIGN16
|
|
1683 "1: \n\t"
|
|
/* pixels 0..3 of this iteration */
1684 PREFETCH" 64(%0, %%"REG_b") \n\t"
|
|
1685 "movd (%0, %%"REG_b"), %%mm0 \n\t"
|
|
1686 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
|
|
1687 "punpcklbw %%mm7, %%mm0 \n\t"
|
|
1688 "punpcklbw %%mm7, %%mm1 \n\t"
|
|
1689 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
|
|
1690 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
|
|
1691 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
1692 "punpcklbw %%mm7, %%mm3 \n\t"
|
|
1693 "pmaddwd %%mm6, %%mm0 \n\t"
|
|
1694 "pmaddwd %%mm6, %%mm1 \n\t"
|
|
1695 "pmaddwd %%mm6, %%mm2 \n\t"
|
|
1696 "pmaddwd %%mm6, %%mm3 \n\t"
|
|
1697 #ifndef FAST_BGR2YV12
|
|
1698 "psrad $8, %%mm0 \n\t"
|
|
1699 "psrad $8, %%mm1 \n\t"
|
|
1700 "psrad $8, %%mm2 \n\t"
|
|
1701 "psrad $8, %%mm3 \n\t"
|
|
1702 #endif
|
|
1703 "packssdw %%mm1, %%mm0 \n\t"
|
|
1704 "packssdw %%mm3, %%mm2 \n\t"
|
|
1705 "pmaddwd %%mm5, %%mm0 \n\t"
|
|
1706 "pmaddwd %%mm5, %%mm2 \n\t"
|
|
1707 "packssdw %%mm2, %%mm0 \n\t"
|
|
1708 "psraw $7, %%mm0 \n\t"
|
|
1709
|
|
/* pixels 4..7 of this iteration, same sequence with mm4 as the accumulator */
1710 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
|
|
1711 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
|
|
1712 "punpcklbw %%mm7, %%mm4 \n\t"
|
|
1713 "punpcklbw %%mm7, %%mm1 \n\t"
|
|
1714 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
|
|
1715 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
|
|
1716 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
1717 "punpcklbw %%mm7, %%mm3 \n\t"
|
|
1718 "pmaddwd %%mm6, %%mm4 \n\t"
|
|
1719 "pmaddwd %%mm6, %%mm1 \n\t"
|
|
1720 "pmaddwd %%mm6, %%mm2 \n\t"
|
|
1721 "pmaddwd %%mm6, %%mm3 \n\t"
|
|
1722 #ifndef FAST_BGR2YV12
|
|
1723 "psrad $8, %%mm4 \n\t"
|
|
1724 "psrad $8, %%mm1 \n\t"
|
|
1725 "psrad $8, %%mm2 \n\t"
|
|
1726 "psrad $8, %%mm3 \n\t"
|
|
1727 #endif
|
|
1728 "packssdw %%mm1, %%mm4 \n\t"
|
|
1729 "packssdw %%mm3, %%mm2 \n\t"
|
|
1730 "pmaddwd %%mm5, %%mm4 \n\t"
|
|
1731 "pmaddwd %%mm5, %%mm2 \n\t"
|
|
1732 "add $24, %%"REG_b" \n\t"
|
|
1733 "packssdw %%mm2, %%mm4 \n\t"
|
|
1734 "psraw $7, %%mm4 \n\t"
|
|
1735
|
|
/* merge both groups to 8 bytes, add the luma offset, store */
1736 "packuswb %%mm4, %%mm0 \n\t"
|
|
1737 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
|
|
1738
|
|
1739 "movq %%mm0, (%1, %%"REG_a") \n\t"
|
|
1740 "add $8, %%"REG_a" \n\t"
|
|
1741 " js 1b \n\t"
|
|
1742 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
|
|
1743 : "%"REG_a, "%"REG_b
|
|
1744 );
|
|
1745 #else
|
|
1746 int i;
|
|
1747 for(i=0; i<width; i++)
|
|
1748 {
|
|
1749 int b= src[i*3+0];
|
|
1750 int g= src[i*3+1];
|
|
1751 int r= src[i*3+2];
|
|
1752
|
|
1753 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
|
|
1754 }
|
|
1755 #endif
|
|
1756 }
|
|
1757
|
|
/*
 * Convert two lines of packed 24 bit pixels (byte order B, G, R) to one line
 * of chroma. Every output sample is derived from a 2x2 block: two adjacent
 * pixels of src1 and the two pixels at the same position in src2.
 *   dstU, dstV  output lines, one byte per sample
 *   src1, src2  the two input lines, 6 bytes consumed per output sample
 *   width       number of output samples
 * With HAVE_MMX the inline asm loop is used, otherwise the plain C loop.
 */
1758 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
|
|
1759 {
|
|
1760 #ifdef HAVE_MMX
|
|
/*
 * Operands: %0 = src1+width*6, %1 = src2+width*6, %2 = dstU+width,
 * %3 = dstV+width, %4 = -width.
 * REG_a = -width is the output index, REG_b = 6*REG_a the source byte index;
 * both count up towards zero. One iteration produces 4 U and 4 V bytes from
 * 24 bytes of each source line.
 * NOTE(review): the loop stops only once REG_a is no longer negative, so a
 * width that is not a multiple of 4 appears to overrun the line -- confirm
 * that callers pad their buffers.
 */
1761 asm volatile(
|
|
1762 "mov %4, %%"REG_a" \n\t"
|
|
1763 "movq "MANGLE(w1111)", %%mm5 \n\t"
|
|
1764 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
|
|
1765 "pxor %%mm7, %%mm7 \n\t"
|
|
1766 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
|
|
1767 "add %%"REG_b", %%"REG_b" \n\t"
|
|
1768 ASMALIGN16
|
|
1769 "1: \n\t"
|
|
1770 PREFETCH" 64(%0, %%"REG_b") \n\t"
|
|
1771 PREFETCH" 64(%1, %%"REG_b") \n\t"
|
|
/*
 * Samples 0 and 1. With PAVGB available: average the two lines bytewise,
 * then average each pixel with its right neighbour (psrlq $24 shifts by one
 * 3 byte pixel) and widen the low 4 bytes to words.
 */
1772 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
|
1773 "movq (%0, %%"REG_b"), %%mm0 \n\t"
|
|
1774 "movq (%1, %%"REG_b"), %%mm1 \n\t"
|
|
1775 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
|
|
1776 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
|
|
1777 PAVGB(%%mm1, %%mm0)
|
|
1778 PAVGB(%%mm3, %%mm2)
|
|
1779 "movq %%mm0, %%mm1 \n\t"
|
|
1780 "movq %%mm2, %%mm3 \n\t"
|
|
1781 "psrlq $24, %%mm0 \n\t"
|
|
1782 "psrlq $24, %%mm2 \n\t"
|
|
1783 PAVGB(%%mm1, %%mm0)
|
|
1784 PAVGB(%%mm3, %%mm2)
|
|
1785 "punpcklbw %%mm7, %%mm0 \n\t"
|
|
1786 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
1787 #else
|
|
/* without PAVGB: widen the four pixels of each 2x2 block to words, add them and divide by 4 */
1788 "movd (%0, %%"REG_b"), %%mm0 \n\t"
|
|
1789 "movd (%1, %%"REG_b"), %%mm1 \n\t"
|
|
1790 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
|
|
1791 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
|
|
1792 "punpcklbw %%mm7, %%mm0 \n\t"
|
|
1793 "punpcklbw %%mm7, %%mm1 \n\t"
|
|
1794 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
1795 "punpcklbw %%mm7, %%mm3 \n\t"
|
|
1796 "paddw %%mm1, %%mm0 \n\t"
|
|
1797 "paddw %%mm3, %%mm2 \n\t"
|
|
1798 "paddw %%mm2, %%mm0 \n\t"
|
|
1799 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
|
|
1800 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
|
|
1801 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
|
|
1802 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
|
|
1803 "punpcklbw %%mm7, %%mm4 \n\t"
|
|
1804 "punpcklbw %%mm7, %%mm1 \n\t"
|
|
1805 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
1806 "punpcklbw %%mm7, %%mm3 \n\t"
|
|
1807 "paddw %%mm1, %%mm4 \n\t"
|
|
1808 "paddw %%mm3, %%mm2 \n\t"
|
|
1809 "paddw %%mm4, %%mm2 \n\t"
|
|
1810 "psrlw $2, %%mm0 \n\t"
|
|
1811 "psrlw $2, %%mm2 \n\t"
|
|
1812 #endif
|
|
/* mm1/mm3 get the V products (bgr2VCoeff), mm0/mm2 the U products (bgr2UCoeff in mm6) */
1813 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
|
|
1814 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
|
|
1815
|
|
1816 "pmaddwd %%mm0, %%mm1 \n\t"
|
|
1817 "pmaddwd %%mm2, %%mm3 \n\t"
|
|
1818 "pmaddwd %%mm6, %%mm0 \n\t"
|
|
1819 "pmaddwd %%mm6, %%mm2 \n\t"
|
|
1820 #ifndef FAST_BGR2YV12
|
|
1821 "psrad $8, %%mm0 \n\t"
|
|
1822 "psrad $8, %%mm1 \n\t"
|
|
1823 "psrad $8, %%mm2 \n\t"
|
|
1824 "psrad $8, %%mm3 \n\t"
|
|
1825 #endif
|
|
1826 "packssdw %%mm2, %%mm0 \n\t"
|
|
1827 "packssdw %%mm3, %%mm1 \n\t"
|
|
1828 "pmaddwd %%mm5, %%mm0 \n\t"
|
|
1829 "pmaddwd %%mm5, %%mm1 \n\t"
|
|
1830 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
|
|
1831 "psraw $7, %%mm0 \n\t"
|
|
1832
|
|
/* samples 2 and 3: same computation on the next 12 source bytes, result in mm4 */
1833 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
|
|
1834 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
|
|
1835 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
|
|
1836 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
|
|
1837 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
|
|
1838 PAVGB(%%mm1, %%mm4)
|
|
1839 PAVGB(%%mm3, %%mm2)
|
|
1840 "movq %%mm4, %%mm1 \n\t"
|
|
1841 "movq %%mm2, %%mm3 \n\t"
|
|
1842 "psrlq $24, %%mm4 \n\t"
|
|
1843 "psrlq $24, %%mm2 \n\t"
|
|
1844 PAVGB(%%mm1, %%mm4)
|
|
1845 PAVGB(%%mm3, %%mm2)
|
|
1846 "punpcklbw %%mm7, %%mm4 \n\t"
|
|
1847 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
1848 #else
|
|
1849 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
|
|
1850 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
|
|
1851 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
|
|
1852 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
|
|
1853 "punpcklbw %%mm7, %%mm4 \n\t"
|
|
1854 "punpcklbw %%mm7, %%mm1 \n\t"
|
|
1855 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
1856 "punpcklbw %%mm7, %%mm3 \n\t"
|
|
1857 "paddw %%mm1, %%mm4 \n\t"
|
|
1858 "paddw %%mm3, %%mm2 \n\t"
|
|
1859 "paddw %%mm2, %%mm4 \n\t"
|
|
/* mm5 is borrowed as a temporary here and reloaded with w1111 below */
1860 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
|
|
1861 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
|
|
1862 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
|
|
1863 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
|
|
1864 "punpcklbw %%mm7, %%mm5 \n\t"
|
|
1865 "punpcklbw %%mm7, %%mm1 \n\t"
|
|
1866 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
1867 "punpcklbw %%mm7, %%mm3 \n\t"
|
|
1868 "paddw %%mm1, %%mm5 \n\t"
|
|
1869 "paddw %%mm3, %%mm2 \n\t"
|
|
1870 "paddw %%mm5, %%mm2 \n\t"
|
|
1871 "movq "MANGLE(w1111)", %%mm5 \n\t"
|
|
1872 "psrlw $2, %%mm4 \n\t"
|
|
1873 "psrlw $2, %%mm2 \n\t"
|
|
1874 #endif
|
|
1875 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
|
|
1876 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
|
|
1877
|
|
1878 "pmaddwd %%mm4, %%mm1 \n\t"
|
|
1879 "pmaddwd %%mm2, %%mm3 \n\t"
|
|
1880 "pmaddwd %%mm6, %%mm4 \n\t"
|
|
1881 "pmaddwd %%mm6, %%mm2 \n\t"
|
|
1882 #ifndef FAST_BGR2YV12
|
|
1883 "psrad $8, %%mm4 \n\t"
|
|
1884 "psrad $8, %%mm1 \n\t"
|
|
1885 "psrad $8, %%mm2 \n\t"
|
|
1886 "psrad $8, %%mm3 \n\t"
|
|
1887 #endif
|
|
1888 "packssdw %%mm2, %%mm4 \n\t"
|
|
1889 "packssdw %%mm3, %%mm1 \n\t"
|
|
1890 "pmaddwd %%mm5, %%mm4 \n\t"
|
|
1891 "pmaddwd %%mm5, %%mm1 \n\t"
|
|
1892 "add $24, %%"REG_b" \n\t"
|
|
1893 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
|
|
1894 "psraw $7, %%mm4 \n\t"
|
|
1895
|
|
/* regroup to U0..U3 (low half) and V0..V3 (high half), pack to bytes and add bgr2UVOffset */
1896 "movq %%mm0, %%mm1 \n\t"
|
|
1897 "punpckldq %%mm4, %%mm0 \n\t"
|
|
1898 "punpckhdq %%mm4, %%mm1 \n\t"
|
|
1899 "packsswb %%mm1, %%mm0 \n\t"
|
|
1900 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
|
|
1901
|
|
/* low 4 bytes go to dstU, high 4 bytes to dstV */
1902 "movd %%mm0, (%2, %%"REG_a") \n\t"
|
|
1903 "punpckhdq %%mm0, %%mm0 \n\t"
|
|
1904 "movd %%mm0, (%3, %%"REG_a") \n\t"
|
|
1905 "add $4, %%"REG_a" \n\t"
|
|
1906 " js 1b \n\t"
|
|
1907 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
|
|
1908 : "%"REG_a, "%"REG_b
|
|
1909 );
|
|
1910 #else
|
|
/* C version: sum each channel over the 2x2 block; the extra +2 in the shift divides the sum by 4 */
1911 int i;
|
|
1912 for(i=0; i<width; i++)
|
|
1913 {
|
|
1914 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
|
|
1915 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
|
|
1916 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
|
|
1917
|
|
1918 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
|
|
1919 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
|
|
1920 }
|
|
1921 #endif
|
|
1922 }
|
|
1923
|
|
1924 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
|
|
1925 {
|
|
1926 int i;
|
|
1927 for(i=0; i<width; i++)
|
|
1928 {
|
|
1929 int d= ((uint16_t*)src)[i];
|
|
1930 int b= d&0x1F;
|
|
1931 int g= (d>>5)&0x3F;
|
|
1932 int r= (d>>11)&0x1F;
|
|
1933
|
|
1934 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
|
|
1935 }
|
|
1936 }
|
|
1937
|
|
1938 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
|
|
1939 {
|
|
1940 int i;
|
|
1941 for(i=0; i<width; i++)
|
|
1942 {
|
|
1943 int d0= ((uint32_t*)src1)[i];
|
|
1944 int d1= ((uint32_t*)src2)[i];
|
|
1945
|
|
1946 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
|
|
1947 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
|
|
1948
|
|
1949 int dh2= (dh>>11) + (dh<<21);
|
|
1950 int d= dh2 + dl;
|
|
1951
|
|
1952 int b= d&0x7F;
|
|
1953 int r= (d>>11)&0x7F;
|
|
1954 int g= d>>21;
|
|
1955 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
|
|
1956 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
|
|
1957 }
|
|
1958 }
|
|
1959
|
|
1960 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
|
|
1961 {
|
|
1962 int i;
|
|
1963 for(i=0; i<width; i++)
|
|
1964 {
|
|
1965 int d= ((uint16_t*)src)[i];
|
|
1966 int b= d&0x1F;
|
|
1967 int g= (d>>5)&0x1F;
|
|
1968 int r= (d>>10)&0x1F;
|
|
1969
|
|
1970 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
|
|
1971 }
|
|
1972 }
|
|
1973
|
|
1974 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
|
|
1975 {
|
|
1976 int i;
|
|
1977 for(i=0; i<width; i++)
|
|
1978 {
|
|
1979 int d0= ((uint32_t*)src1)[i];
|
|
1980 int d1= ((uint32_t*)src2)[i];
|
|
1981
|
|
1982 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
|
|
1983 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
|
|
1984
|
|
1985 int dh2= (dh>>11) + (dh<<21);
|
|
1986 int d= dh2 + dl;
|
|
1987
|
|
1988 int b= d&0x7F;
|
|
1989 int r= (d>>10)&0x7F;
|
|
1990 int g= d>>21;
|
|
1991 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
|
|
1992 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
|
|
1993 }
|
|
1994 }
|
|
1995
|
|
1996
|
|
1997 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
|
|
1998 {
|
|
1999 int i;
|
|
2000 for(i=0; i<width; i++)
|
|
2001 {
|
|
2002 int r= ((uint32_t*)src)[i]&0xFF;
|
|
2003 int g= (((uint32_t*)src)[i]>>8)&0xFF;
|
|
2004 int b= (((uint32_t*)src)[i]>>16)&0xFF;
|
|
2005
|
|
2006 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
|
|
2007 }
|
|
2008 }
|
|
2009
|
|
2010 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
|
|
2011 {
|
|
2012 int i;
|
|
2013 for(i=0; i<width; i++)
|
|
2014 {
|
|
2015 const int a= ((uint32_t*)src1)[2*i+0];
|
|
2016 const int e= ((uint32_t*)src1)[2*i+1];
|
|
2017 const int c= ((uint32_t*)src2)[2*i+0];
|
|
2018 const int d= ((uint32_t*)src2)[2*i+1];
|
|
2019 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
|
|
2020 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
|
|
2021 const int r= l&0x3FF;
|
|
2022 const int g= h>>8;
|
|
2023 const int b= l>>16;
|
|
2024
|
|
2025 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
|
|
2026 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
|
|
2027 }
|
|
2028 }
|
|
2029
|
|
2030 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
|
|
2031 {
|
|
2032 int i;
|
|
2033 for(i=0; i<width; i++)
|
|
2034 {
|
|
2035 int r= src[i*3+0];
|
|
2036 int g= src[i*3+1];
|
|
2037 int b= src[i*3+2];
|
|
2038
|
|
2039 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
|
|
2040 }
|
|
2041 }
|
|
2042
|
|
2043 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
|
|
2044 {
|
|
2045 int i;
|
|
2046 for(i=0; i<width; i++)
|
|
2047 {
|
|
2048 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
|
|
2049 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
|
|
2050 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
|
|
2051
|
|
2052 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
|
|
2053 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
|
|
2054 }
|
|
2055 }
|
|
2056
|
|
2057
|
|
2058 // Bilinear / Bicubic scaling
|
|
/*
 * Horizontally scale one line of 8 bit samples with a per-output FIR filter.
 * As spelled out by the C version at the end:
 *   dst[i] = clip( (sum over j<filterSize of src[filterPos[i]+j] * filter[filterSize*i+j]) >> 7, 0, 32767 )
 *   dst        output line, dstW 16 bit values
 *   src        input line
 *   filter     filterSize coefficients per output sample
 *   filterPos  index of the first source sample for each output sample
 *   srcW, xInc only passed on to the AltiVec implementation; the MMX and C
 *              paths do not use them
 * With HAVE_MMX there are separate asm loops for filterSize 4, 8 and the
 * general case (filterSize must be a positive multiple of 4). All of them
 * produce two output samples per iteration.
 */
2059 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
|
|
2060 int16_t *filter, int16_t *filterPos, long filterSize)
|
|
2061 {
|
|
2062 #ifdef HAVE_MMX
|
|
2063 assert(filterSize % 4 == 0 && filterSize>0);
|
|
2064 if(filterSize==4) // always true for upscaling, sometimes for down too
|
|
2065 {
|
|
/*
 * counter is the negative byte offset into dst (2 bytes per sample); the
 * pointers are moved to the end of their arrays so that counter can run up
 * to zero.
 */
2066 long counter= -2*dstW;
|
|
2067 filter-= counter*2;
|
|
2068 filterPos-= counter/2;
|
|
2069 dst-= counter/2;
|
|
/*
 * Operands: %0 = counter (REG_a), %1 = filter, %2 = filterPos, %3 = src,
 * %4 = dst. The counter is moved to REG_BP (saved/restored with push/pop)
 * so that eax and ebx are free to hold the two source positions.
 * NOTE(review): w02 is defined elsewhere; presumably it makes the
 * pmaddwd after "psrad $8" equivalent to the >>7 of the C version -- confirm.
 */
2070 asm volatile(
|
|
2071 "pxor %%mm7, %%mm7 \n\t"
|
|
2072 "movq "MANGLE(w02)", %%mm6 \n\t"
|
|
2073 "push %%"REG_BP" \n\t" // we use 7 regs here ...
|
|
2074 "mov %%"REG_a", %%"REG_BP" \n\t"
|
|
2075 ASMALIGN16
|
|
2076 "1: \n\t"
|
|
/* eax/ebx = filterPos of two consecutive outputs, mm1/mm3 = their 4 coefficients each */
2077 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
|
|
2078 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
|
|
2079 "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
|
|
2080 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
|
|
2081 "movd (%3, %%"REG_a"), %%mm0 \n\t"
|
|
2082 "movd (%3, %%"REG_b"), %%mm2 \n\t"
|
|
2083 "punpcklbw %%mm7, %%mm0 \n\t"
|
|
2084 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
2085 "pmaddwd %%mm1, %%mm0 \n\t"
|
|
2086 "pmaddwd %%mm2, %%mm3 \n\t"
|
|
2087 "psrad $8, %%mm0 \n\t"
|
|
2088 "psrad $8, %%mm3 \n\t"
|
|
2089 "packssdw %%mm3, %%mm0 \n\t"
|
|
2090 "pmaddwd %%mm6, %%mm0 \n\t"
|
|
2091 "packssdw %%mm0, %%mm0 \n\t"
|
|
/* store two 16 bit results; the loop ends when the add carries, i.e. the counter reaches zero */
2092 "movd %%mm0, (%4, %%"REG_BP") \n\t"
|
|
2093 "add $4, %%"REG_BP" \n\t"
|
|
2094 " jnc 1b \n\t"
|
|
2095
|
|
2096 "pop %%"REG_BP" \n\t"
|
|
2097 : "+a" (counter)
|
|
2098 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
|
|
2099 : "%"REG_b
|
|
2100 );
|
|
2101 }
|
|
2102 else if(filterSize==8)
|
|
2103 {
|
|
/* same scheme as the filterSize==4 case, with 8 coefficients (16 bytes) per output */
2104 long counter= -2*dstW;
|
|
2105 filter-= counter*4;
|
|
2106 filterPos-= counter/2;
|
|
2107 dst-= counter/2;
|
|
2108 asm volatile(
|
|
2109 "pxor %%mm7, %%mm7 \n\t"
|
|
2110 "movq "MANGLE(w02)", %%mm6 \n\t"
|
|
2111 "push %%"REG_BP" \n\t" // we use 7 regs here ...
|
|
2112 "mov %%"REG_a", %%"REG_BP" \n\t"
|
|
2113 ASMALIGN16
|
|
2114 "1: \n\t"
|
|
2115 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
|
|
2116 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
|
|
/* taps 0-3 of both outputs */
2117 "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
|
|
2118 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
|
|
2119 "movd (%3, %%"REG_a"), %%mm0 \n\t"
|
|
2120 "movd (%3, %%"REG_b"), %%mm2 \n\t"
|
|
2121 "punpcklbw %%mm7, %%mm0 \n\t"
|
|
2122 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
2123 "pmaddwd %%mm1, %%mm0 \n\t"
|
|
2124 "pmaddwd %%mm2, %%mm3 \n\t"
|
|
2125
|
|
/* taps 4-7 of both outputs, accumulated onto the first half */
2126 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
|
|
2127 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
|
|
2128 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
|
|
2129 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
|
|
2130 "punpcklbw %%mm7, %%mm4 \n\t"
|
|
2131 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
2132 "pmaddwd %%mm1, %%mm4 \n\t"
|
|
2133 "pmaddwd %%mm2, %%mm5 \n\t"
|
|
2134 "paddd %%mm4, %%mm0 \n\t"
|
|
2135 "paddd %%mm5, %%mm3 \n\t"
|
|
2136
|
|
2137 "psrad $8, %%mm0 \n\t"
|
|
2138 "psrad $8, %%mm3 \n\t"
|
|
2139 "packssdw %%mm3, %%mm0 \n\t"
|
|
2140 "pmaddwd %%mm6, %%mm0 \n\t"
|
|
2141 "packssdw %%mm0, %%mm0 \n\t"
|
|
2142 "movd %%mm0, (%4, %%"REG_BP") \n\t"
|
|
2143 "add $4, %%"REG_BP" \n\t"
|
|
2144 " jnc 1b \n\t"
|
|
2145
|
|
2146 "pop %%"REG_BP" \n\t"
|
|
2147 : "+a" (counter)
|
|
2148 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
|
|
2149 : "%"REG_b
|
|
2150 );
|
|
2151 }
|
|
2152 else
|
|
2153 {
|
|
/*
 * General case. Here filter is not rebased; it is walked forward by the asm
 * itself. offset = src+filterSize is the end marker for the inner tap loop.
 */
2154 uint8_t *offset = src+filterSize;
|
|
2155 long counter= -2*dstW;
|
|
2156 // filter-= counter*filterSize/2;
|
|
2157 filterPos-= counter/2;
|
|
2158 dst-= counter/2;
|
|
/*
 * Operands: %0 = counter, %1 = filter, %2 = filterPos, %3 = dst,
 * %4 = offset, %5 = src, %6 = filterSize*2 (bytes of coefficients per output).
 */
2159 asm volatile(
|
|
2160 "pxor %%mm7, %%mm7 \n\t"
|
|
2161 "movq "MANGLE(w02)", %%mm6 \n\t"
|
|
2162 ASMALIGN16
|
|
2163 "1: \n\t"
|
|
2164 "mov %2, %%"REG_c" \n\t"
|
|
2165 "movzwl (%%"REG_c", %0), %%eax \n\t"
|
|
2166 "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
|
|
2167 "mov %5, %%"REG_c" \n\t"
|
|
2168 "pxor %%mm4, %%mm4 \n\t"
|
|
2169 "pxor %%mm5, %%mm5 \n\t"
|
|
/* inner loop: 4 taps per pass for both outputs; REG_c walks from src to offset */
2170 "2: \n\t"
|
|
2171 "movq (%1), %%mm1 \n\t"
|
|
2172 "movq (%1, %6), %%mm3 \n\t"
|
|
2173 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
|
|
2174 "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
|
|
2175 "punpcklbw %%mm7, %%mm0 \n\t"
|
|
2176 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
2177 "pmaddwd %%mm1, %%mm0 \n\t"
|
|
2178 "pmaddwd %%mm2, %%mm3 \n\t"
|
|
2179 "paddd %%mm3, %%mm5 \n\t"
|
|
2180 "paddd %%mm0, %%mm4 \n\t"
|
|
2181 "add $8, %1 \n\t"
|
|
2182 "add $4, %%"REG_c" \n\t"
|
|
2183 "cmp %4, %%"REG_c" \n\t"
|
|
2184 " jb 2b \n\t"
|
|
/* skip the coefficients of the second output, which were read via (%1, %6) */
2185 "add %6, %1 \n\t"
|
|
2186 "psrad $8, %%mm4 \n\t"
|
|
2187 "psrad $8, %%mm5 \n\t"
|
|
2188 "packssdw %%mm5, %%mm4 \n\t"
|
|
2189 "pmaddwd %%mm6, %%mm4 \n\t"
|
|
2190 "packssdw %%mm4, %%mm4 \n\t"
|
|
2191 "mov %3, %%"REG_a" \n\t"
|
|
2192 "movd %%mm4, (%%"REG_a", %0) \n\t"
|
|
2193 "add $4, %0 \n\t"
|
|
2194 " jnc 1b \n\t"
|
|
2195
|
|
2196 : "+r" (counter), "+r" (filter)
|
|
2197 : "m" (filterPos), "m" (dst), "m"(offset),
|
|
2198 "m" (src), "r" (filterSize*2)
|
|
2199 : "%"REG_b, "%"REG_a, "%"REG_c
|
|
2200 );
|
|
2201 }
|
|
2202 #else
|
|
2203 #ifdef HAVE_ALTIVEC
|
|
2204 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
|
|
2205 #else
|
|
/* plain C version: one output sample per outer iteration */
2206 int i;
|
|
2207 for(i=0; i<dstW; i++)
|
|
2208 {
|
|
2209 int j;
|
|
2210 int srcPos= filterPos[i];
|
|
2211 int val=0;
|
|
2212 // printf("filterPos: %d\n", filterPos[i]);
|
|
2213 for(j=0; j<filterSize; j++)
|
|
2214 {
|
|
2215 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
|
|
2216 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
|
|
2217 }
|
|
2218 // filter += hFilterSize;
|
|
2219 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
|
|
2220 // dst[i] = val>>7;
|
|
2221 }
|
|
2222 #endif
|
|
2223 #endif
|
|
2224 }
|
|
2225 // *** horizontal scale Y line to temp buffer
|
|
/*
 * Produce one horizontally scaled luma line in dst.
 * For the packed YUV and RGB/BGR source formats listed below, the line is
 * first converted to 8 bit luma in formatConvBuffer, which then replaces src.
 * The scaling itself is done by hScale() with hLumFilter/hLumFilterPos, or,
 * when SWS_FAST_BILINEAR is set in flags, by one of the fast bilinear paths:
 *   - HAVE_MMX2 and canMMX2BeUsed: the code pointed to by funnyYCode, driven
 *     by mmx2Filter / mmx2FilterPos
 *   - other x86 builds: a plain integer asm loop
 *   - everything else: the C loop at the end
 * Note that on HAVE_MMX builds hScale() is also used whenever canMMX2BeUsed
 * is zero, even if SWS_FAST_BILINEAR is set.
 */
2226 static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
|
|
2227 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
|
|
2228 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
|
|
2229 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
|
|
2230 int32_t *mmx2FilterPos)
|
|
2231 {
|
|
/* optional input conversion to planar 8 bit luma */
2232 if(srcFormat==IMGFMT_YUY2)
|
|
2233 {
|
|
2234 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
|
|
2235 src= formatConvBuffer;
|
|
2236 }
|
|
2237 else if(srcFormat==IMGFMT_UYVY)
|
|
2238 {
|
|
2239 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
|
|
2240 src= formatConvBuffer;
|
|
2241 }
|
|
2242 else if(srcFormat==IMGFMT_BGR32)
|
|
2243 {
|
|
2244 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
|
|
2245 src= formatConvBuffer;
|
|
2246 }
|
|
2247 else if(srcFormat==IMGFMT_BGR24)
|
|
2248 {
|
|
2249 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
|
|
2250 src= formatConvBuffer;
|
|
2251 }
|
|
2252 else if(srcFormat==IMGFMT_BGR16)
|
|
2253 {
|
|
2254 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
|
|
2255 src= formatConvBuffer;
|
|
2256 }
|
|
2257 else if(srcFormat==IMGFMT_BGR15)
|
|
2258 {
|
|
2259 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
|
|
2260 src= formatConvBuffer;
|
|
2261 }
|
|
2262 else if(srcFormat==IMGFMT_RGB32)
|
|
2263 {
|
|
2264 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
|
|
2265 src= formatConvBuffer;
|
|
2266 }
|
|
2267 else if(srcFormat==IMGFMT_RGB24)
|
|
2268 {
|
|
2269 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
|
|
2270 src= formatConvBuffer;
|
|
2271 }
|
|
2272
|
|
2273 #ifdef HAVE_MMX
|
|
2274 // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
|
|
2275 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
|
|
2276 #else
|
|
2277 if(!(flags&SWS_FAST_BILINEAR))
|
|
2278 #endif
|
|
2279 {
|
|
2280 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
|
|
2281 }
|
|
2282 else // Fast Bilinear upscale / crap downscale
|
|
2283 {
|
|
2284 #if defined(ARCH_X86) || defined(ARCH_X86_64)
|
|
2285 #ifdef HAVE_MMX2
|
|
2286 int i;
|
|
2287 if(canMMX2BeUsed)
|
|
2288 {
|
|
/*
 * Operands: %0 = src, %1 = dst, %2 = mmx2Filter, %3 = mmx2FilterPos,
 * %4 = funnyYCode. The registers are set up once and funnyYCode is then
 * called 8 times through FUNNY_Y_CODE, which advances the source (REG_c)
 * and destination (REG_D) pointers between the calls.
 * NOTE(review): funnyYCode is presumably scaler code generated at init time
 * elsewhere; its register contract is not visible here -- confirm.
 */
2289 asm volatile(
|
|
2290 "pxor %%mm7, %%mm7 \n\t"
|
|
2291 "mov %0, %%"REG_c" \n\t"
|
|
2292 "mov %1, %%"REG_D" \n\t"
|
|
2293 "mov %2, %%"REG_d" \n\t"
|
|
2294 "mov %3, %%"REG_b" \n\t"
|
|
2295 "xor %%"REG_a", %%"REG_a" \n\t" // i
|
|
2296 PREFETCH" (%%"REG_c") \n\t"
|
|
2297 PREFETCH" 32(%%"REG_c") \n\t"
|
|
2298 PREFETCH" 64(%%"REG_c") \n\t"
|
|
2299
|
|
2300 #ifdef ARCH_X86_64
|
|
2301
|
|
2302 #define FUNNY_Y_CODE \
|
|
2303 "movl (%%"REG_b"), %%esi \n\t"\
|
|
2304 "call *%4 \n\t"\
|
|
2305 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
|
|
2306 "add %%"REG_S", %%"REG_c" \n\t"\
|
|
2307 "add %%"REG_a", %%"REG_D" \n\t"\
|
|
2308 "xor %%"REG_a", %%"REG_a" \n\t"\
|
|
2309
|
|
2310 #else
|
|
2311
|
|
2312 #define FUNNY_Y_CODE \
|
|
2313 "movl (%%"REG_b"), %%esi \n\t"\
|
|
2314 "call *%4 \n\t"\
|
|
2315 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
|
|
2316 "add %%"REG_a", %%"REG_D" \n\t"\
|
|
2317 "xor %%"REG_a", %%"REG_a" \n\t"\
|
|
2318
|
|
2319 #endif
|
|
2320
|
|
2321 FUNNY_Y_CODE
|
|
2322 FUNNY_Y_CODE
|
|
2323 FUNNY_Y_CODE
|
|
2324 FUNNY_Y_CODE
|
|
2325 FUNNY_Y_CODE
|
|
2326 FUNNY_Y_CODE
|
|
2327 FUNNY_Y_CODE
|
|
2328 FUNNY_Y_CODE
|
|
2329
|
|
2330 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
|
|
2331 "m" (funnyYCode)
|
|
2332 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
|
|
2333 );
|
|
/* right edge fix-up: outputs whose source position is at or beyond the last sample get that sample, scaled by 128 */
2334 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
|
|
2335 }
|
|
2336 else
|
|
2337 {
|
|
2338 #endif
|
|
/* xInc is a 16.16 fixed point step: integer part and 16 bit fraction are added separately */
2339 long xInc_shr16 = xInc >> 16;
|
|
2340 uint16_t xInc_mask = xInc & 0xffff;
|
|
2341 //NO MMX just normal asm ...
|
|
/*
 * Operands: %0 = src, %1 = dst, %2 = dstWidth, %3 = xInc_shr16,
 * %4 = xInc_mask. REG_a is the output index, REG_b the integer source
 * position and cx the 16 bit fractional position. Two output samples are
 * written per iteration.
 */
2342 asm volatile(
|
|
2343 "xor %%"REG_a", %%"REG_a" \n\t" // i
|
|
2344 "xor %%"REG_b", %%"REG_b" \n\t" // xx
|
|
2345 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
|
|
2346 ASMALIGN16
|
|
2347 "1: \n\t"
|
|
2348 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
|
|
2349 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
|
|
2350 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
|
|
2351 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
|
|
2352 "shll $16, %%edi \n\t"
|
|
2353 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
|
|
2354 "mov %1, %%"REG_D" \n\t"
|
|
2355 "shrl $9, %%esi \n\t"
|
|
2356 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
|
|
2357 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
|
|
2358 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>16 + carry
|
|
2359
|
|
2360 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
|
|
2361 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
|
|
2362 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
|
|
2363 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
|
|
2364 "shll $16, %%edi \n\t"
|
|
2365 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
|
|
2366 "mov %1, %%"REG_D" \n\t"
|
|
2367 "shrl $9, %%esi \n\t"
|
|
2368 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
|
|
2369 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
|
|
2370 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>16 + carry
|
|
2371
|
|
2372
|
|
2373 "add $2, %%"REG_a" \n\t"
|
|
2374 "cmp %2, %%"REG_a" \n\t"
|
|
2375 " jb 1b \n\t"
|
|
2376
|
|
2377
|
|
2378 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
|
|
2379 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
|
|
2380 );
|
|
2381 #ifdef HAVE_MMX2
|
|
2382 } //if MMX2 can't be used
|
|
2383 #endif
|
|
2384 #else
|
|
/* portable C version: xpos is the 16.16 fixed point source position, xalpha its top 7 fraction bits */
2385 int i;
|
|
2386 unsigned int xpos=0;
|
|
2387 for(i=0;i<dstWidth;i++)
|
|
2388 {
|
|
2389 register unsigned int xx=xpos>>16;
|
|
2390 register unsigned int xalpha=(xpos&0xFFFF)>>9;
|
|
2391 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
|
|
2392 xpos+=xInc;
|
|
2393 }
|
|
2394 #endif
|
|
2395 }
|
|
2396 }
|
|
2397
|
|
/*
 * Produce one horizontally scaled line for each of the two chroma planes.
 * The first plane is written to dst[0..], the second to dst[2048..].
 * For the packed YUV and RGB/BGR source formats listed below, src1/src2 are
 * first converted into formatConvBuffer (first plane) and
 * formatConvBuffer+2048 (second plane), which then replace src1 and src2.
 * For gray source formats the function returns without writing anything.
 * The scaling is done by hScale() with hChrFilter/hChrFilterPos, or, when
 * SWS_FAST_BILINEAR is set in flags, by one of the fast bilinear paths:
 *   - HAVE_MMX2 and canMMX2BeUsed: the code pointed to by funnyUVCode, driven
 *     by mmx2Filter / mmx2FilterPos
 *   - other x86 builds: a plain integer asm loop
 *   - everything else: the C loop at the end
 * Note that on HAVE_MMX builds hScale() is also used whenever canMMX2BeUsed
 * is zero, even if SWS_FAST_BILINEAR is set.
 */
2398 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
|
|
2399 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
|
|
2400 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
|
|
2401 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
|
|
2402 int32_t *mmx2FilterPos)
|
|
2403 {
|
|
/* optional input conversion to two planar 8 bit chroma lines */
2404 if(srcFormat==IMGFMT_YUY2)
|
|
2405 {
|
|
2406 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
|
|
2407 src1= formatConvBuffer;
|
|
2408 src2= formatConvBuffer+2048;
|
|
2409 }
|
|
2410 else if(srcFormat==IMGFMT_UYVY)
|
|
2411 {
|
|
2412 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
|
|
2413 src1= formatConvBuffer;
|
|
2414 src2= formatConvBuffer+2048;
|
|
2415 }
|
|
2416 else if(srcFormat==IMGFMT_BGR32)
|
|
2417 {
|
|
2418 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
|
|
2419 src1= formatConvBuffer;
|
|
2420 src2= formatConvBuffer+2048;
|
|
2421 }
|
|
2422 else if(srcFormat==IMGFMT_BGR24)
|
|
2423 {
|
|
2424 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
|
|
2425 src1= formatConvBuffer;
|
|
2426 src2= formatConvBuffer+2048;
|
|
2427 }
|
|
2428 else if(srcFormat==IMGFMT_BGR16)
|
|
2429 {
|
|
2430 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
|
|
2431 src1= formatConvBuffer;
|
|
2432 src2= formatConvBuffer+2048;
|
|
2433 }
|
|
2434 else if(srcFormat==IMGFMT_BGR15)
|
|
2435 {
|
|
2436 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
|
|
2437 src1= formatConvBuffer;
|
|
2438 src2= formatConvBuffer+2048;
|
|
2439 }
|
|
2440 else if(srcFormat==IMGFMT_RGB32)
|
|
2441 {
|
|
2442 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
|
|
2443 src1= formatConvBuffer;
|
|
2444 src2= formatConvBuffer+2048;
|
|
2445 }
|
|
2446 else if(srcFormat==IMGFMT_RGB24)
|
|
2447 {
|
|
2448 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
|
|
2449 src1= formatConvBuffer;
|
|
2450 src2= formatConvBuffer+2048;
|
|
2451 }
|
|
2452 else if(isGray(srcFormat))
|
|
2453 {
|
|
2454 return;
|
|
2455 }
|
|
2456
|
|
2457 #ifdef HAVE_MMX
|
|
2458 // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
|
|
2459 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
|
|
2460 #else
|
|
2461 if(!(flags&SWS_FAST_BILINEAR))
|
|
2462 #endif
|
|
2463 {
|
|
2464 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
|
|
2465 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
|
|
2466 }
|
|
2467 else // Fast Bilinear upscale / crap downscale
|
|
2468 {
|
|
2469 #if defined(ARCH_X86) || defined(ARCH_X86_64)
|
|
2470 #ifdef HAVE_MMX2
|
|
2471 int i;
|
|
2472 if(canMMX2BeUsed)
|
|
2473 {
|
|
/*
 * Operands: %0 = src1, %1 = dst, %2 = mmx2Filter, %3 = mmx2FilterPos,
 * %4 = funnyUVCode, %5 = src2. funnyUVCode is called 4 times for the first
 * plane; then the source is switched to src2 and the destination to
 * dst + 4096 bytes (= dst+2048 elements) and it is called 4 more times.
 * NOTE(review): funnyUVCode is presumably scaler code generated at init time
 * elsewhere; its register contract is not visible here -- confirm.
 */
2474 asm volatile(
|
|
2475 "pxor %%mm7, %%mm7 \n\t"
|
|
2476 "mov %0, %%"REG_c" \n\t"
|
|
2477 "mov %1, %%"REG_D" \n\t"
|
|
2478 "mov %2, %%"REG_d" \n\t"
|
|
2479 "mov %3, %%"REG_b" \n\t"
|
|
2480 "xor %%"REG_a", %%"REG_a" \n\t" // i
|
|
2481 PREFETCH" (%%"REG_c") \n\t"
|
|
2482 PREFETCH" 32(%%"REG_c") \n\t"
|
|
2483 PREFETCH" 64(%%"REG_c") \n\t"
|
|
2484
|
|
2485 #ifdef ARCH_X86_64
|
|
2486
|
|
2487 #define FUNNY_UV_CODE \
|
|
2488 "movl (%%"REG_b"), %%esi \n\t"\
|
|
2489 "call *%4 \n\t"\
|
|
2490 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
|
|
2491 "add %%"REG_S", %%"REG_c" \n\t"\
|
|
2492 "add %%"REG_a", %%"REG_D" \n\t"\
|
|
2493 "xor %%"REG_a", %%"REG_a" \n\t"\
|
|
2494
|
|
2495 #else
|
|
2496
|
|
2497 #define FUNNY_UV_CODE \
|
|
2498 "movl (%%"REG_b"), %%esi \n\t"\
|
|
2499 "call *%4 \n\t"\
|
|
2500 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
|
|
2501 "add %%"REG_a", %%"REG_D" \n\t"\
|
|
2502 "xor %%"REG_a", %%"REG_a" \n\t"\
|
|
2503
|
|
2504 #endif
|
|
2505
|
|
2506 FUNNY_UV_CODE
|
|
2507 FUNNY_UV_CODE
|
|
2508 FUNNY_UV_CODE
|
|
2509 FUNNY_UV_CODE
|
|
2510 "xor %%"REG_a", %%"REG_a" \n\t" // i
|
|
2511 "mov %5, %%"REG_c" \n\t" // src
|
|
2512 "mov %1, %%"REG_D" \n\t" // buf1
|
|
2513 "add $4096, %%"REG_D" \n\t"
|
|
2514 PREFETCH" (%%"REG_c") \n\t"
|
|
2515 PREFETCH" 32(%%"REG_c") \n\t"
|
|
2516 PREFETCH" 64(%%"REG_c") \n\t"
|
|
2517
|
|
2518 FUNNY_UV_CODE
|
|
2519 FUNNY_UV_CODE
|
|
2520 FUNNY_UV_CODE
|
|
2521 FUNNY_UV_CODE
|
|
2522
|
|
2523 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
|
|
2524 "m" (funnyUVCode), "m" (src2)
|
|
2525 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
|
|
2526 );
|
|
/* right edge fix-up: outputs whose source position is at or beyond the last sample get that sample, scaled by 128 */
2527 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
|
|
2528 {
|
|
2529 // printf("%d %d %d\n", dstWidth, i, srcW);
|
|
2530 dst[i] = src1[srcW-1]*128;
|
|
2531 dst[i+2048] = src2[srcW-1]*128;
|
|
2532 }
|
|
2533 }
|
|
2534 else
|
|
2535 {
|
|
2536 #endif
|
|
/* xInc is a 16.16 fixed point step: integer part and 16 bit fraction are added separately */
2537 long xInc_shr16 = (long) (xInc >> 16);
|
|
2538 uint16_t xInc_mask = xInc & 0xffff;
|
|
/*
 * Operands: %0 = src1, %1 = dst, %2 = dstWidth, %3 = xInc_shr16,
 * %4 = xInc_mask, %5 = src2. REG_a is the output index, REG_b the integer
 * source position and cx the 16 bit fractional position. Each iteration
 * writes one sample of the first plane and one of the second plane
 * (4096 bytes further into dst).
 */
2539 asm volatile(
|
|
2540 "xor %%"REG_a", %%"REG_a" \n\t" // i
|
|
2541 "xor %%"REG_b", %%"REG_b" \n\t" // xx
|
|
2542 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
|
|
2543 ASMALIGN16
|
|
2544 "1: \n\t"
|
|
2545 "mov %0, %%"REG_S" \n\t"
|
|
2546 "movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
|
|
2547 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
|
|
2548 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
|
|
2549 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
|
|
2550 "shll $16, %%edi \n\t"
|
|
2551 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
|
|
2552 "mov %1, %%"REG_D" \n\t"
|
|
2553 "shrl $9, %%esi \n\t"
|
|
2554 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
|
|
2555
|
|
2556 "movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
|
|
2557 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
|
|
2558 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
|
|
2559 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
|
|
2560 "shll $16, %%edi \n\t"
|
|
2561 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
|
|
2562 "mov %1, %%"REG_D" \n\t"
|
|
2563 "shrl $9, %%esi \n\t"
|
|
2564 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
|
|
2565
|
|
2566 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
|
|
2567 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>16 + carry
|
|
2568 "add $1, %%"REG_a" \n\t"
|
|
2569 "cmp %2, %%"REG_a" \n\t"
|
|
2570 " jb 1b \n\t"
|
|
2571
|
|
2572 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
|
|
2573 which is needed to support GCC-4.0 */
|
|
2574 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
|
|
2575 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
|
|
2576 #else
|
|
2577 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
|
|
2578 #endif
|
|
2579 "r" (src2)
|
|
2580 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
|
|
2581 );
|
|
2582 #ifdef HAVE_MMX2
|
|
2583 } //if MMX2 can't be used
|
|
2584 #endif
|
|
2585 #else
|
|
/* portable C version: xpos is the 16.16 fixed point source position, xalpha its top 7 fraction bits */
2586 int i;
|
|
2587 unsigned int xpos=0;
|
|
2588 for(i=0;i<dstWidth;i++)
|
|
2589 {
|
|
2590 register unsigned int xx=xpos>>16;
|
|
2591 register unsigned int xalpha=(xpos&0xFFFF)>>9;
|
|
/* xalpha^127 equals 127-xalpha for a 7 bit xalpha, i.e. the weight of the left sample */
2592 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
|
|
2593 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
|
|
2594 /* slower
|
|
2595 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
|
|
2596 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
|
|
2597 */
|
|
2598 xpos+=xInc;
|
|
2599 }
|
|
2600 #endif
|
|
2601 }
|
|
2602 }
|
|
2603
|
|
2604 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
|
|
2605 int srcSliceH, uint8_t* dst[], int dstStride[]){
|
|
2606
|
|
2607 /* load a few things into local vars to make the code more readable? and faster */
|
|
2608 const int srcW= c->srcW;
|
|
2609 const int dstW= c->dstW;
|
|
2610 const int dstH= c->dstH;
|
|
2611 const int chrDstW= c->chrDstW;
|
|
2612 const int chrSrcW= c->chrSrcW;
|
|
2613 const int lumXInc= c->lumXInc;
|
|
2614 const int chrXInc= c->chrXInc;
|
|
2615 const int dstFormat= c->dstFormat;
|
|
2616 const int srcFormat= c->srcFormat;
|
|
2617 const int flags= c->flags;
|
|
2618 const int canMMX2BeUsed= c->canMMX2BeUsed;
|
|
2619 int16_t *vLumFilterPos= c->vLumFilterPos;
|
|
2620 int16_t *vChrFilterPos= c->vChrFilterPos;
|
|
2621 int16_t *hLumFilterPos= c->hLumFilterPos;
|
|
2622 int16_t *hChrFilterPos= c->hChrFilterPos;
|
|
2623 int16_t *vLumFilter= c->vLumFilter;
|
|
2624 int16_t *vChrFilter= c->vChrFilter;
|
|
2625 int16_t *hLumFilter= c->hLumFilter;
|
|
2626 int16_t *hChrFilter= c->hChrFilter;
|
|
2627 int32_t *lumMmxFilter= c->lumMmxFilter;
|
|
2628 int32_t *chrMmxFilter= c->chrMmxFilter;
|
|
2629 const int vLumFilterSize= c->vLumFilterSize;
|
|
2630 const int vChrFilterSize= c->vChrFilterSize;
|
|
2631 const int hLumFilterSize= c->hLumFilterSize;
|
|
2632 const int hChrFilterSize= c->hChrFilterSize;
|
|
2633 int16_t **lumPixBuf= c->lumPixBuf;
|
|
2634 int16_t **chrPixBuf= c->chrPixBuf;
|
|
2635 const int vLumBufSize= c->vLumBufSize;
|
|
2636 const int vChrBufSize= c->vChrBufSize;
|
|
2637 uint8_t *funnyYCode= c->funnyYCode;
|
|
2638 uint8_t *funnyUVCode= c->funnyUVCode;
|
|
2639 uint8_t *formatConvBuffer= c->formatConvBuffer;
|
|
2640 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
|
|
2641 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
|
|
2642 int lastDstY;
|
|
2643
|
|
2644 /* vars whch will change and which we need to storw back in the context */
|
|
2645 int dstY= c->dstY;
|
|
2646 int lumBufIndex= c->lumBufIndex;
|
|
2647 int chrBufIndex= c->chrBufIndex;
|
|
2648 int lastInLumBuf= c->lastInLumBuf;
|
|
2649 int lastInChrBuf= c->lastInChrBuf;
|
|
2650
|
|
2651 if(isPacked(c->srcFormat)){
|
|
2652 src[0]=
|
|
2653 src[1]=
|
|
2654 src[2]= src[0];
|
|
2655 srcStride[0]=
|
|
2656 srcStride[1]=
|
|
2657 srcStride[2]= srcStride[0];
|
|
2658 }
|
|
2659 srcStride[1]<<= c->vChrDrop;
|
|
2660 srcStride[2]<<= c->vChrDrop;
|
|
2661
|
|
2662 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
|
|
2663 // (int)dst[0], (int)dst[1], (int)dst[2]);
|
|
2664
|
|
2665 #if 0 //self test FIXME move to a vfilter or something
|
|
2666 {
|
|
2667 static volatile int i=0;
|
|
2668 i++;
|
|
2669 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
|
|
2670 selfTest(src, srcStride, c->srcW, c->srcH);
|
|
2671 i--;
|
|
2672 }
|
|
2673 #endif
|
|
2674
|
|
2675 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
|
|
2676 //dstStride[0],dstStride[1],dstStride[2]);
|
|
2677
|
|
2678 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
|
|
2679 {
|
|
2680 static int firstTime=1; //FIXME move this into the context perhaps
|
|
2681 if(flags & SWS_PRINT_INFO && firstTime)
|
|
2682 {
|
|
2683 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
|
|
2684 "SwScaler: ->cannot do aligned memory acesses anymore\n");
|
|
2685 firstTime=0;
|
|
2686 }
|
|
2687 }
|
|
2688
|
|
2689 /* Note the user might start scaling the picture in the middle so this will not get executed
|
|
2690 this is not really intended but works currently, so ppl might do it */
|
|
2691 if(srcSliceY ==0){
|
|
2692 lumBufIndex=0;
|
|
2693 chrBufIndex=0;
|
|
2694 dstY=0;
|
|
2695 lastInLumBuf= -1;
|
|
2696 lastInChrBuf= -1;
|
|
2697 }
|
|
2698
|
|
2699 lastDstY= dstY;
|
|
2700
|
|
2701 for(;dstY < dstH; dstY++){
|
|
2702 unsigned char *dest =dst[0]+dstStride[0]*dstY;
|
|
2703 const int chrDstY= dstY>>c->chrDstVSubSample;
|
|
2704 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
|
|
2705 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
|
|
2706
|
|
2707 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
|
|
2708 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
|
|
2709 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
|
|
2710 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
|
|
2711
|
|
2712 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
|
|
2713 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
|
|
2714 //handle holes (FAST_BILINEAR & weird filters)
|
|
2715 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
|
|
2716 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
|
|
2717 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
|
|
2718 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
|
|
2719 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
|
|
2720
|
|
2721 // Do we have enough lines in this slice to output the dstY line
|
|
2722 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
|
|
2723 {
|
|
2724 //Do horizontal scaling
|
|
2725 while(lastInLumBuf < lastLumSrcY)
|
|
2726 {
|
|
2727 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
|
|
2728 lumBufIndex++;
|
|
2729 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
|
|
2730 ASSERT(lumBufIndex < 2*vLumBufSize)
|
|
2731 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
|
|
2732 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
|
|
2733 // printf("%d %d\n", lumBufIndex, vLumBufSize);
|
|
2734 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
|
|
2735 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
|
|
2736 funnyYCode, c->srcFormat, formatConvBuffer,
|
|
2737 c->lumMmx2Filter, c->lumMmx2FilterPos);
|
|
2738 lastInLumBuf++;
|
|
2739 }
|
|
2740 while(lastInChrBuf < lastChrSrcY)
|
|
2741 {
|
|
2742 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
|
|
2743 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
|
|
2744 chrBufIndex++;
|
|
2745 ASSERT(chrBufIndex < 2*vChrBufSize)
|
|
2746 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
|
|
2747 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
|
|
2748 //FIXME replace parameters through context struct (some at least)
|
|
2749
|
|
2750 if(!(isGray(srcFormat) || isGray(dstFormat)))
|
|
2751 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
|
|
2752 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
|
|
2753 funnyUVCode, c->srcFormat, formatConvBuffer,
|
|
2754 c->chrMmx2Filter, c->chrMmx2FilterPos);
|
|
2755 lastInChrBuf++;
|
|
2756 }
|
|
2757 //wrap buf index around to stay inside the ring buffer
|
|
2758 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
|
|
2759 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
|
|
2760 }
|
|
2761 else // not enough lines left in this slice -> load the rest in the buffer
|
|
2762 {
|
|
2763 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
|
|
2764 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
|
|
2765 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
|
|
2766 vChrBufSize, vLumBufSize);*/
|
|
2767
|
|
2768 //Do horizontal scaling
|
|
2769 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
|
|
2770 {
|
|
2771 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
|
|
2772 lumBufIndex++;
|
|
2773 ASSERT(lumBufIndex < 2*vLumBufSize)
|
|
2774 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
|
|
2775 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
|
|
2776 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
|
|
2777 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
|
|
2778 funnyYCode, c->srcFormat, formatConvBuffer,
|
|
2779 c->lumMmx2Filter, c->lumMmx2FilterPos);
|
|
2780 lastInLumBuf++;
|
|
2781 }
|
|
2782 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
|
|
2783 {
|
|
2784 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
|
|
2785 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
|
|
2786 chrBufIndex++;
|
|
2787 ASSERT(chrBufIndex < 2*vChrBufSize)
|
|
2788 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
|
|
2789 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
|
|
2790
|
|
2791 if(!(isGray(srcFormat) || isGray(dstFormat)))
|
|
2792 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
|
|
2793 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
|
|
2794 funnyUVCode, c->srcFormat, formatConvBuffer,
|
|
2795 c->chrMmx2Filter, c->chrMmx2FilterPos);
|
|
2796 lastInChrBuf++;
|
|
2797 }
|
|
2798 //wrap buf index around to stay inside the ring buffer
|
|
2799 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
|
|
2800 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
|
|
2801 break; //we can't output a dstY line so let's try with the next slice
|
|
2802 }
|
|
2803
|
|
2804 #ifdef HAVE_MMX
|
|
2805 b5Dither= dither8[dstY&1];
|
|
2806 g6Dither= dither4[dstY&1];
|
|
2807 g5Dither= dither8[dstY&1];
|
|
2808 r5Dither= dither8[(dstY+1)&1];
|
|
2809 #endif
|
|
2810 if(dstY < dstH-2)
|
|
2811 {
|
|
2812 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
|
|
2813 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
|
|
2814 #ifdef HAVE_MMX
|
|
2815 int i;
|
|
2816 for(i=0; i<vLumFilterSize; i++)
|
|
2817 {
|
|
2818 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
|
|
2819 lumMmxFilter[4*i+2]=
|
|
2820 lumMmxFilter[4*i+3]=
|
|
2821 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
|
|
2822 }
|
|
2823 for(i=0; i<vChrFilterSize; i++)
|
|
2824 {
|
|
2825 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
|
|
2826 chrMmxFilter[4*i+2]=
|
|
2827 chrMmxFilter[4*i+3]=
|
|
2828 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
|
|
2829 }
|
|
2830 #endif
|
|
2831 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
|
|
2832 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
|
|
2833 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
|
|
2834 RENAME(yuv2nv12X)(c,
|
|
2835 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
|
|
2836 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
|
|
2837 dest, uDest, dstW, chrDstW, dstFormat);
|
|
2838 }
|
|
2839 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
|
|
2840 {
|
|
2841 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
|
|
2842 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
|
|
2843 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
|
|
2844 {
|
|
2845 int16_t *lumBuf = lumPixBuf[0];
|
|
2846 int16_t *chrBuf= chrPixBuf[0];
|
|
2847 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
|
|
2848 }
|
|
2849 else //General YV12
|
|
2850 {
|
|
2851 RENAME(yuv2yuvX)(c,
|
|
2852 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
|
|
2853 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
|
|
2854 dest, uDest, vDest, dstW, chrDstW);
|
|
2855 }
|
|
2856 }
|
|
2857 else
|
|
2858 {
|
|
2859 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
|
|
2860 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
|
|
2861 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
|
|
2862 {
|
|
2863 int chrAlpha= vChrFilter[2*dstY+1];
|
|
2864 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
|
|
2865 dest, dstW, chrAlpha, dstFormat, flags, dstY);
|
|
2866 }
|
|
2867 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
|
|
2868 {
|
|
2869 int lumAlpha= vLumFilter[2*dstY+1];
|
|
2870 int chrAlpha= vChrFilter[2*dstY+1];
|
|
2871 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
|
|
2872 dest, dstW, lumAlpha, chrAlpha, dstY);
|
|
2873 }
|
|
2874 else //General RGB
|
|
2875 {
|
|
2876 RENAME(yuv2packedX)(c,
|
|
2877 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
|
|
2878 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
|
|
2879 dest, dstW, dstY);
|
|
2880 }
|
|
2881 }
|
|
2882 }
|
|
2883 else // hmm looks like we can't use MMX here without overwriting this array's tail
|
|
2884 {
|
|
2885 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
|
|
2886 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
|
|
2887 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
|
|
2888 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
|
|
2889 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
|
|
2890 yuv2nv12XinC(
|
|
2891 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
|
|
2892 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
|
|
2893 dest, uDest, dstW, chrDstW, dstFormat);
|
|
2894 }
|
|
2895 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
|
|
2896 {
|
|
2897 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
|
|
2898 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
|
|
2899 yuv2yuvXinC(
|
|
2900 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
|
|
2901 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
|
|
2902 dest, uDest, vDest, dstW, chrDstW);
|
|
2903 }
|
|
2904 else
|
|
2905 {
|
|
2906 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
|
|
2907 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
|
|
2908 yuv2packedXinC(c,
|
|
2909 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
|
|
2910 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
|
|
2911 dest, dstW, dstY);
|
|
2912 }
|
|
2913 }
|
|
2914 }
|
|
2915
|
|
2916 #ifdef HAVE_MMX
|
|
2917 __asm __volatile(SFENCE:::"memory");
|
|
2918 __asm __volatile(EMMS:::"memory");
|
|
2919 #endif
|
|
2920 /* store changed local vars back in the context */
|
|
2921 c->dstY= dstY;
|
|
2922 c->lumBufIndex= lumBufIndex;
|
|
2923 c->chrBufIndex= chrBufIndex;
|
|
2924 c->lastInLumBuf= lastInLumBuf;
|
|
2925 c->lastInChrBuf= lastInChrBuf;
|
|
2926
|
|
2927 return dstY - lastDstY;
|
|
2928 }
|