comparison libswscale/swscale_template.c @ 18861:8579acff875e

Move postproc ---> libswscale
author lucabe
date Fri, 30 Jun 2006 12:00:31 +0000
parents
children bae6c99a99cc
comparison
equal deleted inserted replaced
18860:ef741a3e90f5 18861:8579acff875e
1 /*
2 Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19 #include "asmalign.h"
20
21 #undef REAL_MOVNTQ
22 #undef MOVNTQ
23 #undef PAVGB
24 #undef PREFETCH
25 #undef PREFETCHW
26 #undef EMMS
27 #undef SFENCE
28
/* Pick the MMX/FPU state-clearing instruction for the target CPU. */
29 #ifdef HAVE_3DNOW
30 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
31 #define EMMS "femms"
32 #else
33 #define EMMS "emms"
34 #endif
35
/* Prefetch instructions: 3DNow! and MMX2 variants; otherwise a no-op
   ("/nop" is an asm comment, so nothing is emitted). */
36 #ifdef HAVE_3DNOW
37 #define PREFETCH "prefetch"
38 #define PREFETCHW "prefetchw"
39 #elif defined ( HAVE_MMX2 )
40 #define PREFETCH "prefetchnta"
41 #define PREFETCHW "prefetcht0"
42 #else
43 #define PREFETCH "/nop"
44 #define PREFETCHW "/nop"
45 #endif
46
/* Store fence exists only with MMX2 (SSE); otherwise a no-op. */
47 #ifdef HAVE_MMX2
48 #define SFENCE "sfence"
49 #else
50 #define SFENCE "/nop"
51 #endif
52
/* Byte average: pavgb on MMX2, pavgusb on 3DNow!; intentionally left
   undefined on plain MMX so misuse fails at assembly time. */
53 #ifdef HAVE_MMX2
54 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
55 #elif defined (HAVE_3DNOW)
56 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
57 #endif
58
/* Non-temporal (cache-bypassing) store where available, plain movq
   otherwise. MOVNTQ wraps REAL_MOVNTQ so macro arguments are expanded
   before stringification. */
59 #ifdef HAVE_MMX2
60 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
61 #else
62 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
63 #endif
64 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
65
66 #ifdef HAVE_ALTIVEC
67 #include "swscale_altivec_template.c"
68 #endif
69
/*
 * Multi-tap vertical scale to planar YV12. Walks the per-line filter list
 * at `offset`(%0) (16-byte entries: src pointer + coefficient; the list is
 * terminated by a NULL pointer, detected via the test/jnz on REG_S),
 * accumulates pmulhw products on top of the rounder from VROUNDER_OFFSET,
 * shifts right by 3, packs to unsigned bytes and stores 8 output pixels
 * per outer iteration until REG_a reaches the width in %2.
 */
70 #define YSCALEYUV2YV12X(x, offset) \
71 "xor %%"REG_a", %%"REG_a" \n\t"\
72 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
73 "movq %%mm3, %%mm4 \n\t"\
74 "lea " offset "(%0), %%"REG_d" \n\t"\
75 "mov (%%"REG_d"), %%"REG_S" \n\t"\
76 ASMALIGN16 /* FIXME Unroll? */\
77 "1: \n\t"\
78 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
79 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
80 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
81 "add $16, %%"REG_d" \n\t"\
82 "mov (%%"REG_d"), %%"REG_S" \n\t"\
83 "test %%"REG_S", %%"REG_S" \n\t"\
84 "pmulhw %%mm0, %%mm2 \n\t"\
85 "pmulhw %%mm0, %%mm5 \n\t"\
86 "paddw %%mm2, %%mm3 \n\t"\
87 "paddw %%mm5, %%mm4 \n\t"\
88 " jnz 1b \n\t"\
89 "psraw $3, %%mm3 \n\t"\
90 "psraw $3, %%mm4 \n\t"\
91 "packuswb %%mm4, %%mm3 \n\t"\
92 MOVNTQ(%%mm3, (%1, %%REGa))\
93 "add $8, %%"REG_a" \n\t"\
94 "cmp %2, %%"REG_a" \n\t"\
95 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
96 "movq %%mm3, %%mm4 \n\t"\
97 "lea " offset "(%0), %%"REG_d" \n\t"\
98 "mov (%%"REG_d"), %%"REG_S" \n\t"\
99 "jb 1b \n\t"
100
/*
 * Single-tap (1:1) case: no filtering, just shift the 15-bit intermediate
 * down by 7 and pack/saturate to bytes. Counts REG_a up from a negative
 * width (%2) to zero, hence the jnc loop.
 */
101 #define YSCALEYUV2YV121 \
102 "mov %2, %%"REG_a" \n\t"\
103 ASMALIGN16 /* FIXME Unroll? */\
104 "1: \n\t"\
105 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
106 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
107 "psraw $7, %%mm0 \n\t"\
108 "psraw $7, %%mm1 \n\t"\
109 "packuswb %%mm1, %%mm0 \n\t"\
110 MOVNTQ(%%mm0, (%1, %%REGa))\
111 "add $8, %%"REG_a" \n\t"\
112 "jnc 1b \n\t"
113
114 /*
115 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
116 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
117 "r" (dest), "m" (dstW),
118 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
119 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
120 */
/*
 * Multi-tap vertical scale front-end for packed output: runs the chroma
 * filter list (CHR_MMX_FILTER_OFFSET) into mm3/mm4 (U/V; V lives 4096
 * bytes after U in the chroma buffer), then the luma list
 * (LUM_MMX_FILTER_OFFSET) into mm1/mm7 (Y1/Y2). Each list is NULL-
 * terminated, detected by the test/jnz on REG_S.
 */
121 #define YSCALEYUV2PACKEDX \
122 "xor %%"REG_a", %%"REG_a" \n\t"\
123 ASMALIGN16\
124 "nop \n\t"\
125 "1: \n\t"\
126 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
127 "mov (%%"REG_d"), %%"REG_S" \n\t"\
128 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
129 "movq %%mm3, %%mm4 \n\t"\
130 ASMALIGN16\
131 "2: \n\t"\
132 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
133 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
134 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
135 "add $16, %%"REG_d" \n\t"\
136 "mov (%%"REG_d"), %%"REG_S" \n\t"\
137 "pmulhw %%mm0, %%mm2 \n\t"\
138 "pmulhw %%mm0, %%mm5 \n\t"\
139 "paddw %%mm2, %%mm3 \n\t"\
140 "paddw %%mm5, %%mm4 \n\t"\
141 "test %%"REG_S", %%"REG_S" \n\t"\
142 " jnz 2b \n\t"\
143 \
144 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
145 "mov (%%"REG_d"), %%"REG_S" \n\t"\
146 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
147 "movq %%mm1, %%mm7 \n\t"\
148 ASMALIGN16\
149 "2: \n\t"\
150 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
151 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
152 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
153 "add $16, %%"REG_d" \n\t"\
154 "mov (%%"REG_d"), %%"REG_S" \n\t"\
155 "pmulhw %%mm0, %%mm2 \n\t"\
156 "pmulhw %%mm0, %%mm5 \n\t"\
157 "paddw %%mm2, %%mm1 \n\t"\
158 "paddw %%mm5, %%mm7 \n\t"\
159 "test %%"REG_S", %%"REG_S" \n\t"\
160 " jnz 2b \n\t"\
161
162
/*
 * YUV -> RGB color-space conversion appended to YSCALEYUV2PACKEDX.
 * Consumes mm1/mm7 (Y1/Y2) and mm3/mm4 (U/V), applies the per-context
 * offsets/coefficients stored in (%0), and leaves packed saturated bytes:
 * mm2=B, mm4=G, mm5=R, mm7=0 (as consumed by the WRITEBGR* macros).
 */
163 #define YSCALEYUV2RGBX \
164 YSCALEYUV2PACKEDX\
165 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
166 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
167 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
168 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
169 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
170 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
171 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
172 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
173 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
174 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
175 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
176 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
177 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
178 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
179 "paddw %%mm3, %%mm4 \n\t"\
180 "movq %%mm2, %%mm0 \n\t"\
181 "movq %%mm5, %%mm6 \n\t"\
182 "movq %%mm4, %%mm3 \n\t"\
183 "punpcklwd %%mm2, %%mm2 \n\t"\
184 "punpcklwd %%mm5, %%mm5 \n\t"\
185 "punpcklwd %%mm4, %%mm4 \n\t"\
186 "paddw %%mm1, %%mm2 \n\t"\
187 "paddw %%mm1, %%mm5 \n\t"\
188 "paddw %%mm1, %%mm4 \n\t"\
189 "punpckhwd %%mm0, %%mm0 \n\t"\
190 "punpckhwd %%mm6, %%mm6 \n\t"\
191 "punpckhwd %%mm3, %%mm3 \n\t"\
192 "paddw %%mm7, %%mm0 \n\t"\
193 "paddw %%mm7, %%mm6 \n\t"\
194 "paddw %%mm7, %%mm3 \n\t"\
195 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
196 "packuswb %%mm0, %%mm2 \n\t"\
197 "packuswb %%mm6, %%mm5 \n\t"\
198 "packuswb %%mm3, %%mm4 \n\t"\
199 "pxor %%mm7, %%mm7 \n\t"
/* Disabled legacy full-chroma-resolution YUV->RGB macro, kept for
 * reference only (never compiled; candidate for removal). */
200 #if 0
201 #define FULL_YSCALEYUV2RGB \
202 "pxor %%mm7, %%mm7 \n\t"\
203 "movd %6, %%mm6 \n\t" /*yalpha1*/\
204 "punpcklwd %%mm6, %%mm6 \n\t"\
205 "punpcklwd %%mm6, %%mm6 \n\t"\
206 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
207 "punpcklwd %%mm5, %%mm5 \n\t"\
208 "punpcklwd %%mm5, %%mm5 \n\t"\
209 "xor %%"REG_a", %%"REG_a" \n\t"\
210 ASMALIGN16\
211 "1: \n\t"\
212 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
213 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
214 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
215 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
216 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
217 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
218 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
219 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
220 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
221 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
222 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
223 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
224 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
225 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
226 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
227 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
228 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
229 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
230 \
231 \
232 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
233 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
234 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
235 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
236 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
237 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
238 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
239 \
240 \
241 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
242 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
243 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
244 "paddw %%mm1, %%mm3 \n\t" /* B*/\
245 "paddw %%mm1, %%mm0 \n\t" /* R*/\
246 "packuswb %%mm3, %%mm3 \n\t"\
247 \
248 "packuswb %%mm0, %%mm0 \n\t"\
249 "paddw %%mm4, %%mm2 \n\t"\
250 "paddw %%mm2, %%mm1 \n\t" /* G*/\
251 \
252 "packuswb %%mm1, %%mm1 \n\t"
253 #endif
254
/*
 * Two-line vertical blend for packed YUV output (e.g. YUY2): pre-shifts
 * the stored filter coefficients by 3 in place, then per 8 pixels blends
 * buf0/buf1 (luma) and uvbuf0/uvbuf1 (chroma) with pmulhw and a >>7
 * normalization. Leaves mm1/mm7 = Y1/Y2 and mm3/mm4 = U/V for WRITEYUY2.
 * (The ">>7" comments below were stale ">>4" copies from the RGB variant.)
 */
255 #define REAL_YSCALEYUV2PACKED(index, c) \
256 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
257 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
258 "psraw $3, %%mm0 \n\t"\
259 "psraw $3, %%mm1 \n\t"\
260 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
261 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
262 "xor "#index", "#index" \n\t"\
263 ASMALIGN16\
264 "1: \n\t"\
265 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
266 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
267 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
268 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
269 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
270 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
271 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
272 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
273 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
274 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
275 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
276 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
277 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
278 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
279 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
280 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
281 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
282 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
283 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
284 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
285 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
286 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
287 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
288 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
289 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
290
291 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
292
/*
 * Two-line vertical blend + YUV->RGB conversion (2-tap case). Blends
 * buf0/buf1 and uvbuf0/uvbuf1 per the coefficients stored in the context
 * "#c", converts via the U/V/Y offsets and coefficients in "#c", and
 * leaves packed saturated bytes mm2=B, mm4=G, mm5=R, mm7=0 for WRITEBGR*.
 */
293 #define REAL_YSCALEYUV2RGB(index, c) \
294 "xor "#index", "#index" \n\t"\
295 ASMALIGN16\
296 "1: \n\t"\
297 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
298 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
299 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
300 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
301 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
302 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
303 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
304 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
305 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
306 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
307 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
308 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
309 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
310 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
311 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
312 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
313 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
314 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
315 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
316 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
317 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
318 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
319 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
320 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
321 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
322 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
323 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
324 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
325 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
326 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
327 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
328 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
329 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
330 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
331 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
332 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
333 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
334 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
335 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
336 "paddw %%mm3, %%mm4 \n\t"\
337 "movq %%mm2, %%mm0 \n\t"\
338 "movq %%mm5, %%mm6 \n\t"\
339 "movq %%mm4, %%mm3 \n\t"\
340 "punpcklwd %%mm2, %%mm2 \n\t"\
341 "punpcklwd %%mm5, %%mm5 \n\t"\
342 "punpcklwd %%mm4, %%mm4 \n\t"\
343 "paddw %%mm1, %%mm2 \n\t"\
344 "paddw %%mm1, %%mm5 \n\t"\
345 "paddw %%mm1, %%mm4 \n\t"\
346 "punpckhwd %%mm0, %%mm0 \n\t"\
347 "punpckhwd %%mm6, %%mm6 \n\t"\
348 "punpckhwd %%mm3, %%mm3 \n\t"\
349 "paddw %%mm7, %%mm0 \n\t"\
350 "paddw %%mm7, %%mm6 \n\t"\
351 "paddw %%mm7, %%mm3 \n\t"\
352 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
353 "packuswb %%mm0, %%mm2 \n\t"\
354 "packuswb %%mm6, %%mm5 \n\t"\
355 "packuswb %%mm3, %%mm4 \n\t"\
356 "pxor %%mm7, %%mm7 \n\t"
357 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
358
/* Single-source-line packed-YUV load: no vertical blend, just >>7 of the
 * intermediate; fills mm1/mm7 (Y) and mm3/mm4 (U/V) for WRITEYUY2. */
359 #define REAL_YSCALEYUV2PACKED1(index, c) \
360 "xor "#index", "#index" \n\t"\
361 ASMALIGN16\
362 "1: \n\t"\
363 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
364 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
365 "psraw $7, %%mm3 \n\t" \
366 "psraw $7, %%mm4 \n\t" \
367 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
368 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
369 "psraw $7, %%mm1 \n\t" \
370 "psraw $7, %%mm7 \n\t" \
371
372 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
373
/* Single-source-line YUV->RGB (1-tap): like REAL_YSCALEYUV2RGB but reads
 * only buf0/uvbuf0; leaves mm2=B, mm4=G, mm5=R, mm7=0 for WRITEBGR*. */
374 #define REAL_YSCALEYUV2RGB1(index, c) \
375 "xor "#index", "#index" \n\t"\
376 ASMALIGN16\
377 "1: \n\t"\
378 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
379 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
380 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
381 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
382 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
383 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
384 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
385 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
386 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
387 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
388 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
389 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
390 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
391 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
392 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
393 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
394 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
395 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
396 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
397 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
398 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
399 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
400 "paddw %%mm3, %%mm4 \n\t"\
401 "movq %%mm2, %%mm0 \n\t"\
402 "movq %%mm5, %%mm6 \n\t"\
403 "movq %%mm4, %%mm3 \n\t"\
404 "punpcklwd %%mm2, %%mm2 \n\t"\
405 "punpcklwd %%mm5, %%mm5 \n\t"\
406 "punpcklwd %%mm4, %%mm4 \n\t"\
407 "paddw %%mm1, %%mm2 \n\t"\
408 "paddw %%mm1, %%mm5 \n\t"\
409 "paddw %%mm1, %%mm4 \n\t"\
410 "punpckhwd %%mm0, %%mm0 \n\t"\
411 "punpckhwd %%mm6, %%mm6 \n\t"\
412 "punpckhwd %%mm3, %%mm3 \n\t"\
413 "paddw %%mm7, %%mm0 \n\t"\
414 "paddw %%mm7, %%mm6 \n\t"\
415 "paddw %%mm7, %%mm3 \n\t"\
416 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
417 "packuswb %%mm0, %%mm2 \n\t"\
418 "packuswb %%mm6, %%mm5 \n\t"\
419 "packuswb %%mm3, %%mm4 \n\t"\
420 "pxor %%mm7, %%mm7 \n\t"
421 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
422
/* Packed-YUV load averaging the two chroma input lines ((a+b)>>8 approx.
 * of the mean of two 15-bit values); luma from buf0 only. */
423 #define REAL_YSCALEYUV2PACKED1b(index, c) \
424 "xor "#index", "#index" \n\t"\
425 ASMALIGN16\
426 "1: \n\t"\
427 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
428 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
429 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
430 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
431 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
432 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
433 "psrlw $8, %%mm3 \n\t" \
434 "psrlw $8, %%mm4 \n\t" \
435 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
436 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
437 "psraw $7, %%mm1 \n\t" \
438 "psraw $7, %%mm7 \n\t"
439 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
440
441 // do vertical chrominance interpolation
/* 1-tap luma + averaged two-line chroma YUV->RGB; same register contract
 * as REAL_YSCALEYUV2RGB (leaves mm2=B, mm4=G, mm5=R, mm7=0). */
442 #define REAL_YSCALEYUV2RGB1b(index, c) \
443 "xor "#index", "#index" \n\t"\
444 ASMALIGN16\
445 "1: \n\t"\
446 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
447 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
448 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
449 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
450 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
451 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
452 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
453 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
454 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
455 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
456 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
457 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
458 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
459 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
460 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
461 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
462 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
463 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
464 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
465 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
466 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
467 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
468 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
469 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
470 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
471 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
472 "paddw %%mm3, %%mm4 \n\t"\
473 "movq %%mm2, %%mm0 \n\t"\
474 "movq %%mm5, %%mm6 \n\t"\
475 "movq %%mm4, %%mm3 \n\t"\
476 "punpcklwd %%mm2, %%mm2 \n\t"\
477 "punpcklwd %%mm5, %%mm5 \n\t"\
478 "punpcklwd %%mm4, %%mm4 \n\t"\
479 "paddw %%mm1, %%mm2 \n\t"\
480 "paddw %%mm1, %%mm5 \n\t"\
481 "paddw %%mm1, %%mm4 \n\t"\
482 "punpckhwd %%mm0, %%mm0 \n\t"\
483 "punpckhwd %%mm6, %%mm6 \n\t"\
484 "punpckhwd %%mm3, %%mm3 \n\t"\
485 "paddw %%mm7, %%mm0 \n\t"\
486 "paddw %%mm7, %%mm6 \n\t"\
487 "paddw %%mm7, %%mm3 \n\t"\
488 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
489 "packuswb %%mm0, %%mm2 \n\t"\
490 "packuswb %%mm6, %%mm5 \n\t"\
491 "packuswb %%mm3, %%mm4 \n\t"\
492 "pxor %%mm7, %%mm7 \n\t"
493 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
494
/* Interleave mm2=B, mm4=G, mm5=R (mm7 must be 0) into 0RGB dwords and
 * store 8 pixels (32 bytes); advances #index and loops to label 1. */
495 #define REAL_WRITEBGR32(dst, dstw, index) \
496 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
497 "movq %%mm2, %%mm1 \n\t" /* B */\
498 "movq %%mm5, %%mm6 \n\t" /* R */\
499 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
500 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
501 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
502 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
503 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
504 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
505 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
506 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
507 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
508 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
509 \
510 MOVNTQ(%%mm0, (dst, index, 4))\
511 MOVNTQ(%%mm2, 8(dst, index, 4))\
512 MOVNTQ(%%mm1, 16(dst, index, 4))\
513 MOVNTQ(%%mm3, 24(dst, index, 4))\
514 \
515 "add $8, "#index" \n\t"\
516 "cmp "#dstw", "#index" \n\t"\
517 " jb 1b \n\t"
518 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
519
/* Pack B/G/R bytes into RGB565 words (5-6-5 bit fields) and store
 * 8 pixels (16 bytes) per iteration. */
520 #define REAL_WRITEBGR16(dst, dstw, index) \
521 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
522 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
523 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
524 "psrlq $3, %%mm2 \n\t"\
525 \
526 "movq %%mm2, %%mm1 \n\t"\
527 "movq %%mm4, %%mm3 \n\t"\
528 \
529 "punpcklbw %%mm7, %%mm3 \n\t"\
530 "punpcklbw %%mm5, %%mm2 \n\t"\
531 "punpckhbw %%mm7, %%mm4 \n\t"\
532 "punpckhbw %%mm5, %%mm1 \n\t"\
533 \
534 "psllq $3, %%mm3 \n\t"\
535 "psllq $3, %%mm4 \n\t"\
536 \
537 "por %%mm3, %%mm2 \n\t"\
538 "por %%mm4, %%mm1 \n\t"\
539 \
540 MOVNTQ(%%mm2, (dst, index, 2))\
541 MOVNTQ(%%mm1, 8(dst, index, 2))\
542 \
543 "add $8, "#index" \n\t"\
544 "cmp "#dstw", "#index" \n\t"\
545 " jb 1b \n\t"
546 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
547
/* Pack B/G/R bytes into RGB555 words (5-5-5 bit fields) and store
 * 8 pixels (16 bytes) per iteration. */
548 #define REAL_WRITEBGR15(dst, dstw, index) \
549 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
550 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
551 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
552 "psrlq $3, %%mm2 \n\t"\
553 "psrlq $1, %%mm5 \n\t"\
554 \
555 "movq %%mm2, %%mm1 \n\t"\
556 "movq %%mm4, %%mm3 \n\t"\
557 \
558 "punpcklbw %%mm7, %%mm3 \n\t"\
559 "punpcklbw %%mm5, %%mm2 \n\t"\
560 "punpckhbw %%mm7, %%mm4 \n\t"\
561 "punpckhbw %%mm5, %%mm1 \n\t"\
562 \
563 "psllq $2, %%mm3 \n\t"\
564 "psllq $2, %%mm4 \n\t"\
565 \
566 "por %%mm3, %%mm2 \n\t"\
567 "por %%mm4, %%mm1 \n\t"\
568 \
569 MOVNTQ(%%mm2, (dst, index, 2))\
570 MOVNTQ(%%mm1, 8(dst, index, 2))\
571 \
572 "add $8, "#index" \n\t"\
573 "cmp "#dstw", "#index" \n\t"\
574 " jb 1b \n\t"
575 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
576
/* Obsolete 24-bit packer (shift/mask approach); superseded by
 * WRITEBGR24MMX / WRITEBGR24MMX2 below but kept for reference. */
577 #define WRITEBGR24OLD(dst, dstw, index) \
578 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
579 "movq %%mm2, %%mm1 \n\t" /* B */\
580 "movq %%mm5, %%mm6 \n\t" /* R */\
581 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
582 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
583 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
584 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
585 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
586 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
587 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
588 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
589 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
590 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
591 \
592 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
593 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
594 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
595 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
596 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
597 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
598 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
599 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
600 \
601 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
602 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
603 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
604 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
605 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
606 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
607 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
608 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
609 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
610 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
611 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
612 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
613 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
614 \
615 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
616 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
617 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
618 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
619 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
620 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
621 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
622 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
623 \
624 MOVNTQ(%%mm0, (dst))\
625 MOVNTQ(%%mm2, 8(dst))\
626 MOVNTQ(%%mm3, 16(dst))\
627 "add $24, "#dst" \n\t"\
628 \
629 "add $8, "#index" \n\t"\
630 "cmp "#dstw", "#index" \n\t"\
631 " jb 1b \n\t"
632
/* Plain-MMX 24-bit packer: builds 0RGBRGB0 qwords then shifts/ors them
 * into three contiguous 8-byte stores (24 bytes = 8 pixels) per loop. */
633 #define WRITEBGR24MMX(dst, dstw, index) \
634 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
635 "movq %%mm2, %%mm1 \n\t" /* B */\
636 "movq %%mm5, %%mm6 \n\t" /* R */\
637 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
638 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
639 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
640 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
641 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
642 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
643 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
644 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
645 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
646 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
647 \
648 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
649 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
650 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
651 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
652 \
653 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
654 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
655 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
656 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
657 \
658 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
659 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
660 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
661 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
662 \
663 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
664 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
665 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
666 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
667 MOVNTQ(%%mm0, (dst))\
668 \
669 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
670 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
671 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
672 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
673 MOVNTQ(%%mm6, 8(dst))\
674 \
675 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
676 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
677 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
678 MOVNTQ(%%mm5, 16(dst))\
679 \
680 "add $24, "#dst" \n\t"\
681 \
682 "add $8, "#index" \n\t"\
683 "cmp "#dstw", "#index" \n\t"\
684 " jb 1b \n\t"
685
/* MMX2 24-bit packer: uses pshufw plus the M24A/M24B/M24C selection masks
 * to gather B/G/R bytes directly into three 8-byte stores per 8 pixels. */
686 #define WRITEBGR24MMX2(dst, dstw, index) \
687 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
688 "movq "MANGLE(M24A)", %%mm0 \n\t"\
689 "movq "MANGLE(M24C)", %%mm7 \n\t"\
690 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
691 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
692 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
693 \
694 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
695 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
696 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
697 \
698 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
699 "por %%mm1, %%mm6 \n\t"\
700 "por %%mm3, %%mm6 \n\t"\
701 MOVNTQ(%%mm6, (dst))\
702 \
703 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
704 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
705 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
706 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
707 \
708 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
709 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
710 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
711 \
712 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
713 "por %%mm3, %%mm6 \n\t"\
714 MOVNTQ(%%mm6, 8(dst))\
715 \
716 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
717 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
718 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
719 \
720 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
721 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
722 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
723 \
724 "por %%mm1, %%mm3 \n\t"\
725 "por %%mm3, %%mm6 \n\t"\
726 MOVNTQ(%%mm6, 16(dst))\
727 \
728 "add $24, "#dst" \n\t"\
729 \
730 "add $8, "#index" \n\t"\
731 "cmp "#dstw", "#index" \n\t"\
732 " jb 1b \n\t"
733
/* Dispatch: pshufw-based packer needs MMX2, otherwise plain MMX. */
734 #ifdef HAVE_MMX2
735 #undef WRITEBGR24
736 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
737 #else
738 #undef WRITEBGR24
739 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
740 #endif
741
/* Interleave mm1/mm7 (Y) with mm3 (U) and mm4 (V) into YUYV order and
 * store 8 pixels (16 bytes) per iteration. */
742 #define REAL_WRITEYUY2(dst, dstw, index) \
743 "packuswb %%mm3, %%mm3 \n\t"\
744 "packuswb %%mm4, %%mm4 \n\t"\
745 "packuswb %%mm7, %%mm1 \n\t"\
746 "punpcklbw %%mm4, %%mm3 \n\t"\
747 "movq %%mm1, %%mm7 \n\t"\
748 "punpcklbw %%mm3, %%mm1 \n\t"\
749 "punpckhbw %%mm3, %%mm7 \n\t"\
750 \
751 MOVNTQ(%%mm1, (dst, index, 2))\
752 MOVNTQ(%%mm7, 8(dst, index, 2))\
753 \
754 "add $8, "#index" \n\t"\
755 "cmp "#dstw", "#index" \n\t"\
756 " jb 1b \n\t"
757 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
758
759
/*
 * Vertically scale + filter one output line of planar YUV: dest gets the
 * filtered luma, uDest/vDest (when uDest is non-NULL) the filtered chroma
 * (V stored 4096 bytes past U inside chrSrc, per YSCALEYUV2YV12X).
 * MMX path drives the YSCALEYUV2YV12X asm on the filter tables embedded
 * in the context (reached via &c->redDither); otherwise AltiVec or the
 * plain C fallback is used.
 * NOTE(review): the "p" constraint passes chrDstW/dstW as an address
 * operand so it becomes a bare immediate in "cmp %2" — verify on both
 * 32- and 64-bit builds.
 */
760 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
761 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
762 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
763 {
764 #ifdef HAVE_MMX
765 if(uDest != NULL)
766 {
767 asm volatile(
768 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
769 :: "r" (&c->redDither),
770 "r" (uDest), "p" (chrDstW)
771 : "%"REG_a, "%"REG_d, "%"REG_S
772 );
773
774 asm volatile(
775 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
776 :: "r" (&c->redDither),
777 "r" (vDest), "p" (chrDstW)
778 : "%"REG_a, "%"REG_d, "%"REG_S
779 );
780 }
781
782 asm volatile(
783 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
784 :: "r" (&c->redDither),
785 "r" (dest), "p" (dstW)
786 : "%"REG_a, "%"REG_d, "%"REG_S
787 );
788 #else
789 #ifdef HAVE_ALTIVEC
790 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
791 chrFilter, chrSrc, chrFilterSize,
792 dest, uDest, vDest, dstW, chrDstW);
793 #else //HAVE_ALTIVEC
794 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
795 chrFilter, chrSrc, chrFilterSize,
796 dest, uDest, vDest, dstW, chrDstW);
797 #endif //!HAVE_ALTIVEC
798 #endif
799 }
800
/* NV12/NV21 output: no SIMD path exists, so simply delegate to the
 * generic C implementation. */
801 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
802 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
803 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
804 {
805 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
806 chrFilter, chrSrc, chrFilterSize,
807 dest, uDest, dstW, chrDstW, dstFormat);
808 }
809
/*
 * 1:1 vertical scale (single input line): convert 15-bit intermediates to
 * bytes with a >>7 shift and saturation. Chroma (uDest/vDest) is written
 * only when uDest is non-NULL; V again lives at chrSrc + 2048 samples.
 * The MMX path counts up from a negative offset (see YSCALEYUV2YV121).
 */
810 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
811 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
812 {
813 #ifdef HAVE_MMX
814 if(uDest != NULL)
815 {
816 asm volatile(
817 YSCALEYUV2YV121
818 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
819 "g" (-chrDstW)
820 : "%"REG_a
821 );
822
823 asm volatile(
824 YSCALEYUV2YV121
825 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
826 "g" (-chrDstW)
827 : "%"REG_a
828 );
829 }
830
831 asm volatile(
832 YSCALEYUV2YV121
833 :: "r" (lumSrc + dstW), "r" (dest + dstW),
834 "g" (-dstW)
835 : "%"REG_a
836 );
837 #else
838 int i;
839 for(i=0; i<dstW; i++)
840 {
841 int val= lumSrc[i]>>7;
842
/* Fast clip: bit 8 set means val is outside [0,255]; the sign then
 * distinguishes underflow from overflow. Assumes val stays within
 * [-256, 511] — TODO confirm against the intermediate-value range. */
843 if(val&256){
844 if(val<0) val=0;
845 else val=255;
846 }
847
848 dest[i]= val;
849 }
850
851 if(uDest != NULL)
852 for(i=0; i<chrDstW; i++)
853 {
854 int u=chrSrc[i]>>7;
855 int v=chrSrc[i + 2048]>>7;
856
/* Same trick, but with full range checks since u and v share one test. */
857 if((u|v)&256){
858 if(u<0) u=0;
859 else if (u>255) u=255;
860 if(v<0) v=0;
861 else if (v>255) v=255;
862 }
863
864 uDest[i]= u;
865 vDest[i]= v;
866 }
867 #endif
868 }
869
870
871 /**
872 * vertical scale YV12 to RGB
873 */
874 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
875 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
876 uint8_t *dest, long dstW, long dstY)
877 {
878 long dummy=0;
879 switch(c->dstFormat)
880 {
881 #ifdef HAVE_MMX
882 case IMGFMT_BGR32:
883 {
884 asm volatile(
885 YSCALEYUV2RGBX
886 WRITEBGR32(%4, %5, %%REGa)
887
888 :: "r" (&c->redDither),
889 "m" (dummy), "m" (dummy), "m" (dummy),
890 "r" (dest), "m" (dstW)
891 : "%"REG_a, "%"REG_d, "%"REG_S
892 );
893 }
894 break;
895 case IMGFMT_BGR24:
896 {
897 asm volatile(
898 YSCALEYUV2RGBX
899 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
900 "add %4, %%"REG_b" \n\t"
901 WRITEBGR24(%%REGb, %5, %%REGa)
902
903 :: "r" (&c->redDither),
904 "m" (dummy), "m" (dummy), "m" (dummy),
905 "r" (dest), "m" (dstW)
906 : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
907 );
908 }
909 break;
910 case IMGFMT_BGR15:
911 {
912 asm volatile(
913 YSCALEYUV2RGBX
914 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
915 #ifdef DITHER1XBPP
916 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
917 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
918 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
919 #endif
920
921 WRITEBGR15(%4, %5, %%REGa)
922
923 :: "r" (&c->redDither),
924 "m" (dummy), "m" (dummy), "m" (dummy),
925 "r" (dest), "m" (dstW)
926 : "%"REG_a, "%"REG_d, "%"REG_S
927 );
928 }
929 break;
930 case IMGFMT_BGR16:
931 {
932 asm volatile(
933 YSCALEYUV2RGBX
934 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
935 #ifdef DITHER1XBPP
936 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
937 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
938 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
939 #endif
940
941 WRITEBGR16(%4, %5, %%REGa)
942
943 :: "r" (&c->redDither),
944 "m" (dummy), "m" (dummy), "m" (dummy),
945 "r" (dest), "m" (dstW)
946 : "%"REG_a, "%"REG_d, "%"REG_S
947 );
948 }
949 break;
950 case IMGFMT_YUY2:
951 {
952 asm volatile(
953 YSCALEYUV2PACKEDX
954 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
955
956 "psraw $3, %%mm3 \n\t"
957 "psraw $3, %%mm4 \n\t"
958 "psraw $3, %%mm1 \n\t"
959 "psraw $3, %%mm7 \n\t"
960 WRITEYUY2(%4, %5, %%REGa)
961
962 :: "r" (&c->redDither),
963 "m" (dummy), "m" (dummy), "m" (dummy),
964 "r" (dest), "m" (dstW)
965 : "%"REG_a, "%"REG_d, "%"REG_S
966 );
967 }
968 break;
969 #endif
970 default:
971 #ifdef HAVE_ALTIVEC
972 /* The following list of supported dstFormat values should
973 match what's found in the body of altivec_yuv2packedX() */
974 if(c->dstFormat==IMGFMT_ABGR || c->dstFormat==IMGFMT_BGRA ||
975 c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 ||
976 c->dstFormat==IMGFMT_RGBA || c->dstFormat==IMGFMT_ARGB)
977 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
978 chrFilter, chrSrc, chrFilterSize,
979 dest, dstW, dstY);
980 else
981 #endif
982 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
983 chrFilter, chrSrc, chrFilterSize,
984 dest, dstW, dstY);
985 break;
986 }
987 }
988
989 /**
990 * vertical bilinear scale YV12 to RGB
991 */
992 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
993 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
994 {
995 int yalpha1=yalpha^4095;
996 int uvalpha1=uvalpha^4095;
997 int i;
998
999 #if 0 //isn't used
1000 if(flags&SWS_FULL_CHR_H_INT)
1001 {
1002 switch(dstFormat)
1003 {
1004 #ifdef HAVE_MMX
1005 case IMGFMT_BGR32:
1006 asm volatile(
1007
1008
1009 FULL_YSCALEYUV2RGB
1010 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1011 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1012
1013 "movq %%mm3, %%mm1 \n\t"
1014 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1015 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1016
1017 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1018 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1019
1020 "add $4, %%"REG_a" \n\t"
1021 "cmp %5, %%"REG_a" \n\t"
1022 " jb 1b \n\t"
1023
1024
1025 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1026 "m" (yalpha1), "m" (uvalpha1)
1027 : "%"REG_a
1028 );
1029 break;
1030 case IMGFMT_BGR24:
1031 asm volatile(
1032
1033 FULL_YSCALEYUV2RGB
1034
1035 // lsb ... msb
1036 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1037 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1038
1039 "movq %%mm3, %%mm1 \n\t"
1040 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1041 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1042
1043 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1044 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1045 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1046 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1047 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1048 "movq %%mm1, %%mm2 \n\t"
1049 "psllq $48, %%mm1 \n\t" // 000000BG
1050 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1051
1052 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1053 "psrld $16, %%mm2 \n\t" // R000R000
1054 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1055 "por %%mm2, %%mm1 \n\t" // RBGRR000
1056
1057 "mov %4, %%"REG_b" \n\t"
1058 "add %%"REG_a", %%"REG_b" \n\t"
1059
1060 #ifdef HAVE_MMX2
1061 //FIXME Alignment
1062 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1063 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1064 #else
1065 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1066 "psrlq $32, %%mm3 \n\t"
1067 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1068 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1069 #endif
1070 "add $4, %%"REG_a" \n\t"
1071 "cmp %5, %%"REG_a" \n\t"
1072 " jb 1b \n\t"
1073
1074 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1075 "m" (yalpha1), "m" (uvalpha1)
1076 : "%"REG_a, "%"REG_b
1077 );
1078 break;
1079 case IMGFMT_BGR15:
1080 asm volatile(
1081
1082 FULL_YSCALEYUV2RGB
1083 #ifdef DITHER1XBPP
1084 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1085 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1086 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1087 #endif
1088 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1089 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1090 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1091
1092 "psrlw $3, %%mm3 \n\t"
1093 "psllw $2, %%mm1 \n\t"
1094 "psllw $7, %%mm0 \n\t"
1095 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1096 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1097
1098 "por %%mm3, %%mm1 \n\t"
1099 "por %%mm1, %%mm0 \n\t"
1100
1101 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1102
1103 "add $4, %%"REG_a" \n\t"
1104 "cmp %5, %%"REG_a" \n\t"
1105 " jb 1b \n\t"
1106
1107 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1108 "m" (yalpha1), "m" (uvalpha1)
1109 : "%"REG_a
1110 );
1111 break;
1112 case IMGFMT_BGR16:
1113 asm volatile(
1114
1115 FULL_YSCALEYUV2RGB
1116 #ifdef DITHER1XBPP
1117 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1118 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1119 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1120 #endif
1121 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1122 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1123 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1124
1125 "psrlw $3, %%mm3 \n\t"
1126 "psllw $3, %%mm1 \n\t"
1127 "psllw $8, %%mm0 \n\t"
1128 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1129 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1130
1131 "por %%mm3, %%mm1 \n\t"
1132 "por %%mm1, %%mm0 \n\t"
1133
1134 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1135
1136 "add $4, %%"REG_a" \n\t"
1137 "cmp %5, %%"REG_a" \n\t"
1138 " jb 1b \n\t"
1139
1140 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1141 "m" (yalpha1), "m" (uvalpha1)
1142 : "%"REG_a
1143 );
1144 break;
1145 #endif
1146 case IMGFMT_RGB32:
1147 #ifndef HAVE_MMX
1148 case IMGFMT_BGR32:
1149 #endif
1150 if(dstFormat==IMGFMT_BGR32)
1151 {
1152 int i;
1153 #ifdef WORDS_BIGENDIAN
1154 dest++;
1155 #endif
1156 for(i=0;i<dstW;i++){
1157 // vertical linear interpolation && yuv2rgb in a single step:
1158 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1159 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1160 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1161 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1162 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1163 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1164 dest+= 4;
1165 }
1166 }
1167 else if(dstFormat==IMGFMT_BGR24)
1168 {
1169 int i;
1170 for(i=0;i<dstW;i++){
1171 // vertical linear interpolation && yuv2rgb in a single step:
1172 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1173 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1174 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1175 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1176 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1177 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1178 dest+= 3;
1179 }
1180 }
1181 else if(dstFormat==IMGFMT_BGR16)
1182 {
1183 int i;
1184 for(i=0;i<dstW;i++){
1185 // vertical linear interpolation && yuv2rgb in a single step:
1186 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1187 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1188 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1189
1190 ((uint16_t*)dest)[i] =
1191 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1192 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1193 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1194 }
1195 }
1196 else if(dstFormat==IMGFMT_BGR15)
1197 {
1198 int i;
1199 for(i=0;i<dstW;i++){
1200 // vertical linear interpolation && yuv2rgb in a single step:
1201 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1202 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1203 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1204
1205 ((uint16_t*)dest)[i] =
1206 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1207 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1208 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1209 }
1210 }
1211 }//FULL_UV_IPOL
1212 else
1213 {
1214 #endif // if 0
1215 #ifdef HAVE_MMX
1216 switch(c->dstFormat)
1217 {
1218 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1219 case IMGFMT_BGR32:
1220 asm volatile(
1221 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1222 "mov %4, %%"REG_b" \n\t"
1223 "push %%"REG_BP" \n\t"
1224 YSCALEYUV2RGB(%%REGBP, %5)
1225 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1226 "pop %%"REG_BP" \n\t"
1227 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1228
1229 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1230 "a" (&c->redDither)
1231 );
1232 return;
1233 case IMGFMT_BGR24:
1234 asm volatile(
1235 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1236 "mov %4, %%"REG_b" \n\t"
1237 "push %%"REG_BP" \n\t"
1238 YSCALEYUV2RGB(%%REGBP, %5)
1239 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1240 "pop %%"REG_BP" \n\t"
1241 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1242 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1243 "a" (&c->redDither)
1244 );
1245 return;
1246 case IMGFMT_BGR15:
1247 asm volatile(
1248 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1249 "mov %4, %%"REG_b" \n\t"
1250 "push %%"REG_BP" \n\t"
1251 YSCALEYUV2RGB(%%REGBP, %5)
1252 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1253 #ifdef DITHER1XBPP
1254 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1255 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1256 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1257 #endif
1258
1259 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1260 "pop %%"REG_BP" \n\t"
1261 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1262
1263 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1264 "a" (&c->redDither)
1265 );
1266 return;
1267 case IMGFMT_BGR16:
1268 asm volatile(
1269 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1270 "mov %4, %%"REG_b" \n\t"
1271 "push %%"REG_BP" \n\t"
1272 YSCALEYUV2RGB(%%REGBP, %5)
1273 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1274 #ifdef DITHER1XBPP
1275 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1276 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1277 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1278 #endif
1279
1280 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1281 "pop %%"REG_BP" \n\t"
1282 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1283 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1284 "a" (&c->redDither)
1285 );
1286 return;
1287 case IMGFMT_YUY2:
1288 asm volatile(
1289 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1290 "mov %4, %%"REG_b" \n\t"
1291 "push %%"REG_BP" \n\t"
1292 YSCALEYUV2PACKED(%%REGBP, %5)
1293 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1294 "pop %%"REG_BP" \n\t"
1295 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1296 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1297 "a" (&c->redDither)
1298 );
1299 return;
1300 default: break;
1301 }
1302 #endif //HAVE_MMX
1303 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1304 }
1305
1306 /**
1307 * YV12 to RGB without scaling or interpolating
1308 */
1309 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1310 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1311 {
1312 const int yalpha1=0;
1313 int i;
1314
1315 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1316 const int yalpha= 4096; //FIXME ...
1317
1318 if(flags&SWS_FULL_CHR_H_INT)
1319 {
1320 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1321 return;
1322 }
1323
1324 #ifdef HAVE_MMX
1325 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1326 {
1327 switch(dstFormat)
1328 {
1329 case IMGFMT_BGR32:
1330 asm volatile(
1331 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1332 "mov %4, %%"REG_b" \n\t"
1333 "push %%"REG_BP" \n\t"
1334 YSCALEYUV2RGB1(%%REGBP, %5)
1335 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1336 "pop %%"REG_BP" \n\t"
1337 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1338
1339 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1340 "a" (&c->redDither)
1341 );
1342 return;
1343 case IMGFMT_BGR24:
1344 asm volatile(
1345 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1346 "mov %4, %%"REG_b" \n\t"
1347 "push %%"REG_BP" \n\t"
1348 YSCALEYUV2RGB1(%%REGBP, %5)
1349 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1350 "pop %%"REG_BP" \n\t"
1351 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1352
1353 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1354 "a" (&c->redDither)
1355 );
1356 return;
1357 case IMGFMT_BGR15:
1358 asm volatile(
1359 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1360 "mov %4, %%"REG_b" \n\t"
1361 "push %%"REG_BP" \n\t"
1362 YSCALEYUV2RGB1(%%REGBP, %5)
1363 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1364 #ifdef DITHER1XBPP
1365 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1366 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1367 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1368 #endif
1369 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1370 "pop %%"REG_BP" \n\t"
1371 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1372
1373 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1374 "a" (&c->redDither)
1375 );
1376 return;
1377 case IMGFMT_BGR16:
1378 asm volatile(
1379 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1380 "mov %4, %%"REG_b" \n\t"
1381 "push %%"REG_BP" \n\t"
1382 YSCALEYUV2RGB1(%%REGBP, %5)
1383 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1384 #ifdef DITHER1XBPP
1385 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1386 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1387 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1388 #endif
1389
1390 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1391 "pop %%"REG_BP" \n\t"
1392 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1393
1394 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1395 "a" (&c->redDither)
1396 );
1397 return;
1398 case IMGFMT_YUY2:
1399 asm volatile(
1400 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1401 "mov %4, %%"REG_b" \n\t"
1402 "push %%"REG_BP" \n\t"
1403 YSCALEYUV2PACKED1(%%REGBP, %5)
1404 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1405 "pop %%"REG_BP" \n\t"
1406 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1407
1408 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1409 "a" (&c->redDither)
1410 );
1411 return;
1412 }
1413 }
1414 else
1415 {
1416 switch(dstFormat)
1417 {
1418 case IMGFMT_BGR32:
1419 asm volatile(
1420 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1421 "mov %4, %%"REG_b" \n\t"
1422 "push %%"REG_BP" \n\t"
1423 YSCALEYUV2RGB1b(%%REGBP, %5)
1424 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1425 "pop %%"REG_BP" \n\t"
1426 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1427
1428 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1429 "a" (&c->redDither)
1430 );
1431 return;
1432 case IMGFMT_BGR24:
1433 asm volatile(
1434 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1435 "mov %4, %%"REG_b" \n\t"
1436 "push %%"REG_BP" \n\t"
1437 YSCALEYUV2RGB1b(%%REGBP, %5)
1438 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1439 "pop %%"REG_BP" \n\t"
1440 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1441
1442 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1443 "a" (&c->redDither)
1444 );
1445 return;
1446 case IMGFMT_BGR15:
1447 asm volatile(
1448 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1449 "mov %4, %%"REG_b" \n\t"
1450 "push %%"REG_BP" \n\t"
1451 YSCALEYUV2RGB1b(%%REGBP, %5)
1452 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1453 #ifdef DITHER1XBPP
1454 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1455 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1456 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1457 #endif
1458 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1459 "pop %%"REG_BP" \n\t"
1460 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1461
1462 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1463 "a" (&c->redDither)
1464 );
1465 return;
1466 case IMGFMT_BGR16:
1467 asm volatile(
1468 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1469 "mov %4, %%"REG_b" \n\t"
1470 "push %%"REG_BP" \n\t"
1471 YSCALEYUV2RGB1b(%%REGBP, %5)
1472 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1473 #ifdef DITHER1XBPP
1474 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1475 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1476 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1477 #endif
1478
1479 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1480 "pop %%"REG_BP" \n\t"
1481 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1482
1483 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1484 "a" (&c->redDither)
1485 );
1486 return;
1487 case IMGFMT_YUY2:
1488 asm volatile(
1489 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1490 "mov %4, %%"REG_b" \n\t"
1491 "push %%"REG_BP" \n\t"
1492 YSCALEYUV2PACKED1b(%%REGBP, %5)
1493 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1494 "pop %%"REG_BP" \n\t"
1495 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1496
1497 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1498 "a" (&c->redDither)
1499 );
1500 return;
1501 }
1502 }
1503 #endif
1504 if( uvalpha < 2048 )
1505 {
1506 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1507 }else{
1508 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1509 }
1510 }
1511
//FIXME the yuy2* input readers can read up to 7 samples too many
1513
/* Extract the luma bytes of one YUY2 line (Y at even byte positions, as the
   C path's src[2*i] shows) into a packed 8-bit plane. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
	asm volatile(
		/* mask that keeps every other byte (same selection as src[2*i]) */
		"movq "MANGLE(bm01010101)", %%mm2\n\t"
		"mov %0, %%"REG_a" \n\t"
		"1: \n\t"
		"movq (%1, %%"REG_a",2), %%mm0 \n\t"
		"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
		"pand %%mm2, %%mm0 \n\t"
		"pand %%mm2, %%mm1 \n\t"
		"packuswb %%mm1, %%mm0 \n\t"
		"movq %%mm0, (%2, %%"REG_a") \n\t"
		"add $8, %%"REG_a" \n\t"
		" js 1b \n\t"
		/* pointers pre-advanced to the end; REG_a counts up from -width to 0 */
		: : "g" (-width), "r" (src+width*2), "r" (dst+width)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i];
#endif
}
1538
/* Average the chroma of two YUY2 lines and deinterleave it into separate U
   and V planes.  NOTE(review): the SIMD path uses PAVGB (rounds up) while
   the C path truncates with ">>1" — results can differ by one LSB. */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t"
		"mov %0, %%"REG_a" \n\t"
		"1: \n\t"
		"movq (%1, %%"REG_a",4), %%mm0 \n\t"
		"movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
		"movq (%2, %%"REG_a",4), %%mm2 \n\t"
		"movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
		/* vertical average of the two lines */
		PAVGB(%%mm2, %%mm0)
		PAVGB(%%mm3, %%mm1)
		/* drop the luma (even) bytes, keeping the interleaved chroma */
		"psrlw $8, %%mm0 \n\t"
		"psrlw $8, %%mm1 \n\t"
		"packuswb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		/* split interleaved U/V into the two planes */
		"psrlw $8, %%mm0 \n\t"
		"pand %%mm4, %%mm1 \n\t"
		"packuswb %%mm0, %%mm0 \n\t"
		"packuswb %%mm1, %%mm1 \n\t"
		"movd %%mm0, (%4, %%"REG_a") \n\t"
		"movd %%mm1, (%3, %%"REG_a") \n\t"
		"add $4, %%"REG_a" \n\t"
		" js 1b \n\t"
		: : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
		dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
	}
#endif
}
1576
//this is almost identical to the previous function, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
/* Extract the luma bytes of one UYVY line (Y at odd byte positions, as the
   C path's src[2*i+1] shows) into a packed 8-bit plane. */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
	asm volatile(
		"mov %0, %%"REG_a" \n\t"
		"1: \n\t"
		"movq (%1, %%"REG_a",2), %%mm0 \n\t"
		"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
		/* keep the high (odd-position) byte of each 16-bit pair */
		"psrlw $8, %%mm0 \n\t"
		"psrlw $8, %%mm1 \n\t"
		"packuswb %%mm1, %%mm0 \n\t"
		"movq %%mm0, (%2, %%"REG_a") \n\t"
		"add $8, %%"REG_a" \n\t"
		" js 1b \n\t"
		/* pointers pre-advanced to the end; REG_a counts up from -width to 0 */
		: : "g" (-width), "r" (src+width*2), "r" (dst+width)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i+1];
#endif
}
1601
/* Average the chroma of two UYVY lines (chroma at even byte positions, per
   the C path's 4*i+0 / 4*i+2 indices) and deinterleave it into U and V
   planes.  NOTE(review): PAVGB rounds up while the C ">>1" truncates. */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t"
		"mov %0, %%"REG_a" \n\t"
		"1: \n\t"
		"movq (%1, %%"REG_a",4), %%mm0 \n\t"
		"movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
		"movq (%2, %%"REG_a",4), %%mm2 \n\t"
		"movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
		/* vertical average of the two lines */
		PAVGB(%%mm2, %%mm0)
		PAVGB(%%mm3, %%mm1)
		/* keep the chroma (even) bytes, dropping the luma */
		"pand %%mm4, %%mm0 \n\t"
		"pand %%mm4, %%mm1 \n\t"
		"packuswb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		/* split interleaved U/V into the two planes */
		"psrlw $8, %%mm0 \n\t"
		"pand %%mm4, %%mm1 \n\t"
		"packuswb %%mm0, %%mm0 \n\t"
		"packuswb %%mm1, %%mm1 \n\t"
		"movd %%mm0, (%4, %%"REG_a") \n\t"
		"movd %%mm1, (%3, %%"REG_a") \n\t"
		"add $4, %%"REG_a" \n\t"
		" js 1b \n\t"
		: : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
		dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
	}
#endif
}
1639
1640 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1641 {
1642 int i;
1643 for(i=0; i<width; i++)
1644 {
1645 int b= ((uint32_t*)src)[i]&0xFF;
1646 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1647 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1648
1649 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1650 }
1651 }
1652
1653 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1654 {
1655 int i;
1656 for(i=0; i<width; i++)
1657 {
1658 const int a= ((uint32_t*)src1)[2*i+0];
1659 const int e= ((uint32_t*)src1)[2*i+1];
1660 const int c= ((uint32_t*)src2)[2*i+0];
1661 const int d= ((uint32_t*)src2)[2*i+1];
1662 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1663 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1664 const int b= l&0x3FF;
1665 const int g= h>>8;
1666 const int r= l>>16;
1667
1668 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1669 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1670 }
1671 }
1672
/* Convert one line of 24-bit BGR to 8-bit luma.  The MMX path produces
   8 luma samples (24 input bytes) per loop iteration, in two groups of 4
   pixels that are finally packed and offset together. */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
	asm volatile(
		"mov %2, %%"REG_a" \n\t"
		"movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
		"movq "MANGLE(w1111)", %%mm5 \n\t"
		"pxor %%mm7, %%mm7 \n\t"
		/* REG_b = 3*REG_a: byte offset into the 3-bytes-per-pixel source */
		"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
		ASMALIGN16
		"1: \n\t"
		PREFETCH" 64(%0, %%"REG_b") \n\t"
		/* load pixels 0..3 (one per register), widen bytes to words */
		"movd (%0, %%"REG_b"), %%mm0 \n\t"
		"movd 3(%0, %%"REG_b"), %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"movd 6(%0, %%"REG_b"), %%mm2 \n\t"
		"movd 9(%0, %%"REG_b"), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		/* per-pixel dot product with the Y coefficients */
		"pmaddwd %%mm6, %%mm0 \n\t"
		"pmaddwd %%mm6, %%mm1 \n\t"
		"pmaddwd %%mm6, %%mm2 \n\t"
		"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0 \n\t"
		"psrad $8, %%mm1 \n\t"
		"psrad $8, %%mm2 \n\t"
		"psrad $8, %%mm3 \n\t"
#endif
		/* combine the partial sums horizontally (w1111 = all-ones words) */
		"packssdw %%mm1, %%mm0 \n\t"
		"packssdw %%mm3, %%mm2 \n\t"
		"pmaddwd %%mm5, %%mm0 \n\t"
		"pmaddwd %%mm5, %%mm2 \n\t"
		"packssdw %%mm2, %%mm0 \n\t"
		"psraw $7, %%mm0 \n\t"

		/* same for pixels 4..7 (input bytes 12..23) */
		"movd 12(%0, %%"REG_b"), %%mm4 \n\t"
		"movd 15(%0, %%"REG_b"), %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm4 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"movd 18(%0, %%"REG_b"), %%mm2 \n\t"
		"movd 21(%0, %%"REG_b"), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"pmaddwd %%mm6, %%mm4 \n\t"
		"pmaddwd %%mm6, %%mm1 \n\t"
		"pmaddwd %%mm6, %%mm2 \n\t"
		"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4 \n\t"
		"psrad $8, %%mm1 \n\t"
		"psrad $8, %%mm2 \n\t"
		"psrad $8, %%mm3 \n\t"
#endif
		"packssdw %%mm1, %%mm4 \n\t"
		"packssdw %%mm3, %%mm2 \n\t"
		"pmaddwd %%mm5, %%mm4 \n\t"
		"pmaddwd %%mm5, %%mm2 \n\t"
		"add $24, %%"REG_b" \n\t"
		"packssdw %%mm2, %%mm4 \n\t"
		"psraw $7, %%mm4 \n\t"

		/* pack the 8 results to bytes and add the luma offset */
		"packuswb %%mm4, %%mm0 \n\t"
		"paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"

		"movq %%mm0, (%1, %%"REG_a") \n\t"
		"add $8, %%"REG_a" \n\t"
		" js 1b \n\t"
		: : "r" (src+width*3), "r" (dst+width), "g" (-width)
		: "%"REG_a, "%"REG_b
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src[i*3+0];
		int g= src[i*3+1];
		int r= src[i*3+2];

		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
	}
#endif
}
1757
/* Convert 24-bit BGR to subsampled U/V: each chroma sample is derived from
   a 2x2 pixel block (two horizontal pixels from each of src1/src2, as the
   C fallback's 6*i offsets show).  The MMX path emits 4 U and 4 V samples
   per iteration. */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
	asm volatile(
		"mov %4, %%"REG_a" \n\t"
		"movq "MANGLE(w1111)", %%mm5 \n\t"
		"movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
		"pxor %%mm7, %%mm7 \n\t"
		/* REG_b = 6*REG_a: two 3-byte pixels consumed per output sample */
		"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
		"add %%"REG_b", %%"REG_b" \n\t"
		ASMALIGN16
		"1: \n\t"
		PREFETCH" 64(%0, %%"REG_b") \n\t"
		PREFETCH" 64(%1, %%"REG_b") \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		/* average vertically (pavgb) and then horizontally via the
		   3-byte-shifted copy */
		"movq (%0, %%"REG_b"), %%mm0 \n\t"
		"movq (%1, %%"REG_b"), %%mm1 \n\t"
		"movq 6(%0, %%"REG_b"), %%mm2 \n\t"
		"movq 6(%1, %%"REG_b"), %%mm3 \n\t"
		PAVGB(%%mm1, %%mm0)
		PAVGB(%%mm3, %%mm2)
		"movq %%mm0, %%mm1 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"psrlq $24, %%mm0 \n\t"
		"psrlq $24, %%mm2 \n\t"
		PAVGB(%%mm1, %%mm0)
		PAVGB(%%mm3, %%mm2)
		"punpcklbw %%mm7, %%mm0 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
#else
		/* no pavgb available: widen to words and sum the 4 pixels, then
		   divide by 4 with psrlw $2 */
		"movd (%0, %%"REG_b"), %%mm0 \n\t"
		"movd (%1, %%"REG_b"), %%mm1 \n\t"
		"movd 3(%0, %%"REG_b"), %%mm2 \n\t"
		"movd 3(%1, %%"REG_b"), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"paddw %%mm1, %%mm0 \n\t"
		"paddw %%mm3, %%mm2 \n\t"
		"paddw %%mm2, %%mm0 \n\t"
		"movd 6(%0, %%"REG_b"), %%mm4 \n\t"
		"movd 6(%1, %%"REG_b"), %%mm1 \n\t"
		"movd 9(%0, %%"REG_b"), %%mm2 \n\t"
		"movd 9(%1, %%"REG_b"), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm4 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"paddw %%mm1, %%mm4 \n\t"
		"paddw %%mm3, %%mm2 \n\t"
		"paddw %%mm4, %%mm2 \n\t"
		"psrlw $2, %%mm0 \n\t"
		"psrlw $2, %%mm2 \n\t"
#endif
		"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
		"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

		/* dot products: mm1/mm3 accumulate V, mm0/mm2 accumulate U */
		"pmaddwd %%mm0, %%mm1 \n\t"
		"pmaddwd %%mm2, %%mm3 \n\t"
		"pmaddwd %%mm6, %%mm0 \n\t"
		"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0 \n\t"
		"psrad $8, %%mm1 \n\t"
		"psrad $8, %%mm2 \n\t"
		"psrad $8, %%mm3 \n\t"
#endif
		"packssdw %%mm2, %%mm0 \n\t"
		"packssdw %%mm3, %%mm1 \n\t"
		"pmaddwd %%mm5, %%mm0 \n\t"
		"pmaddwd %%mm5, %%mm1 \n\t"
		"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
		"psraw $7, %%mm0 \n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		/* second pair of 2x2 blocks (input bytes 12..23) */
		"movq 12(%0, %%"REG_b"), %%mm4 \n\t"
		"movq 12(%1, %%"REG_b"), %%mm1 \n\t"
		"movq 18(%0, %%"REG_b"), %%mm2 \n\t"
		"movq 18(%1, %%"REG_b"), %%mm3 \n\t"
		PAVGB(%%mm1, %%mm4)
		PAVGB(%%mm3, %%mm2)
		"movq %%mm4, %%mm1 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"psrlq $24, %%mm4 \n\t"
		"psrlq $24, %%mm2 \n\t"
		PAVGB(%%mm1, %%mm4)
		PAVGB(%%mm3, %%mm2)
		"punpcklbw %%mm7, %%mm4 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
#else
		"movd 12(%0, %%"REG_b"), %%mm4 \n\t"
		"movd 12(%1, %%"REG_b"), %%mm1 \n\t"
		"movd 15(%0, %%"REG_b"), %%mm2 \n\t"
		"movd 15(%1, %%"REG_b"), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm4 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"paddw %%mm1, %%mm4 \n\t"
		"paddw %%mm3, %%mm2 \n\t"
		"paddw %%mm2, %%mm4 \n\t"
		"movd 18(%0, %%"REG_b"), %%mm5 \n\t"
		"movd 18(%1, %%"REG_b"), %%mm1 \n\t"
		"movd 21(%0, %%"REG_b"), %%mm2 \n\t"
		"movd 21(%1, %%"REG_b"), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm5 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"paddw %%mm1, %%mm5 \n\t"
		"paddw %%mm3, %%mm2 \n\t"
		"paddw %%mm5, %%mm2 \n\t"
		/* mm5 was used as a temporary above, reload the w1111 constant */
		"movq "MANGLE(w1111)", %%mm5 \n\t"
		"psrlw $2, %%mm4 \n\t"
		"psrlw $2, %%mm2 \n\t"
#endif
		"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
		"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

		"pmaddwd %%mm4, %%mm1 \n\t"
		"pmaddwd %%mm2, %%mm3 \n\t"
		"pmaddwd %%mm6, %%mm4 \n\t"
		"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4 \n\t"
		"psrad $8, %%mm1 \n\t"
		"psrad $8, %%mm2 \n\t"
		"psrad $8, %%mm3 \n\t"
#endif
		"packssdw %%mm2, %%mm4 \n\t"
		"packssdw %%mm3, %%mm1 \n\t"
		"pmaddwd %%mm5, %%mm4 \n\t"
		"pmaddwd %%mm5, %%mm1 \n\t"
		"add $24, %%"REG_b" \n\t"
		"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
		"psraw $7, %%mm4 \n\t"

		/* regroup U0..U3 / V0..V3, pack to bytes and add the 128 offset */
		"movq %%mm0, %%mm1 \n\t"
		"punpckldq %%mm4, %%mm0 \n\t"
		"punpckhdq %%mm4, %%mm1 \n\t"
		"packsswb %%mm1, %%mm0 \n\t"
		"paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"

		"movd %%mm0, (%2, %%"REG_a") \n\t"
		"punpckhdq %%mm0, %%mm0 \n\t"
		"movd %%mm0, (%3, %%"REG_a") \n\t"
		"add $4, %%"REG_a" \n\t"
		" js 1b \n\t"
		: : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
		: "%"REG_a, "%"REG_b
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
		int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
		int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
#endif
}
1923
1924 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1925 {
1926 int i;
1927 for(i=0; i<width; i++)
1928 {
1929 int d= ((uint16_t*)src)[i];
1930 int b= d&0x1F;
1931 int g= (d>>5)&0x3F;
1932 int r= (d>>11)&0x1F;
1933
1934 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1935 }
1936 }
1937
/* Convert BGR565 to subsampled U/V: each output chroma sample averages a
   2x2 block (two adjacent pixels from each of the two input lines).
   Uses a SWAR trick: one 32-bit load grabs two 16-bit pixels at once. */
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int d0= ((uint32_t*)src1)[i];
		int d1= ((uint32_t*)src2)[i];

		/* Split the color fields of both pixels into two words whose
		   fields are spaced far enough apart that four samples can be
		   summed per field without carries bleeding between fields. */
		int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
		int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);

		/* Realign dh's fields onto dl's layout and merge: each field of
		   d now holds the sum of the 4 corresponding 2x2 samples. */
		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		int b= d&0x7F;         /* sum of four 5-bit blues  (<=124) */
		int r= (d>>11)&0x7F;   /* sum of four 5-bit reds   (<=124) */
		int g= d>>21;          /* sum of four 6-bit greens (<=252) */
		/* doubled R/B coeffs + "-2" in the shift rescale 5/6-bit
		   components to 8-bit range; "+2" divides the 4-sample sum */
		dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
		dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
	}
}
1959
1960 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1961 {
1962 int i;
1963 for(i=0; i<width; i++)
1964 {
1965 int d= ((uint16_t*)src)[i];
1966 int b= d&0x1F;
1967 int g= (d>>5)&0x1F;
1968 int r= (d>>10)&0x1F;
1969
1970 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1971 }
1972 }
1973
/* Convert BGR555 to subsampled U/V: each output chroma sample averages a
   2x2 block (two adjacent pixels from each of the two input lines).
   Same SWAR scheme as bgr16ToUV but with 5-bit green fields. */
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int d0= ((uint32_t*)src1)[i];
		int d1= ((uint32_t*)src2)[i];

		/* separate the fields of both packed pixels so that four
		   samples per field can be summed without inter-field carries */
		int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
		int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);

		/* realign dh onto dl's field layout and merge the sums */
		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		int b= d&0x7F;         /* sum of four 5-bit blues  */
		int r= (d>>10)&0x7F;   /* sum of four 5-bit reds   */
		int g= d>>21;          /* sum of four 5-bit greens */
		/* "-3" rescales the 5-bit components to 8-bit range,
		   "+2" divides the 4-sample sum */
		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
	}
}
1995
1996
1997 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1998 {
1999 int i;
2000 for(i=0; i<width; i++)
2001 {
2002 int r= ((uint32_t*)src)[i]&0xFF;
2003 int g= (((uint32_t*)src)[i]>>8)&0xFF;
2004 int b= (((uint32_t*)src)[i]>>16)&0xFF;
2005
2006 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2007 }
2008 }
2009
/* Convert packed 32-bit RGB to subsampled U/V: each output chroma sample
   averages a 2x2 pixel block (pixels 2*i and 2*i+1 of both input lines). */
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		const int a= ((uint32_t*)src1)[2*i+0];
		const int e= ((uint32_t*)src1)[2*i+1];
		const int c= ((uint32_t*)src2)[2*i+0];
		const int d= ((uint32_t*)src2)[2*i+1];
		/* l accumulates the R (bits 0..7) and B (bits 16..23) bytes of
		   all four pixels in parallel; a 4-byte sum is at most 1020, so
		   10 bits per field keep the two sums from overlapping */
		const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
		/* h accumulates the four G bytes (bits 8..15) */
		const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
		const int r= l&0x3FF;   /* sum of four R bytes */
		const int g= h>>8;      /* sum of four G bytes */
		const int b= l>>16;     /* sum of four B bytes */

		/* "+2" in the shift divides the 4-sample sums for averaging */
		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
}
2029
2030 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2031 {
2032 int i;
2033 for(i=0; i<width; i++)
2034 {
2035 int r= src[i*3+0];
2036 int g= src[i*3+1];
2037 int b= src[i*3+2];
2038
2039 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2040 }
2041 }
2042
2043 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2044 {
2045 int i;
2046 for(i=0; i<width; i++)
2047 {
2048 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2049 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2050 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2051
2052 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2053 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2054 }
2055 }
2056
2057
2058 // Bilinear / Bicubic scaling
/* Horizontally scale one line of 8-bit samples into 15-bit intermediate
   output (see the C fallback: result is clipped to 0..(1<<15)-1).
   filterPos[i] gives the first source sample for output i; filter holds
   filterSize 16-bit taps per output sample.
   NOTE(review): the MMX paths require filterSize to be a positive multiple
   of 4 (asserted below); filterSize==4 and ==8 get unrolled fast paths. */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
				  int16_t *filter, int16_t *filterPos, long filterSize)
{
#ifdef HAVE_MMX
	assert(filterSize % 4 == 0 && filterSize>0);
	if(filterSize==4) // allways true for upscaling, sometimes for down too
	{
		/* counter runs from -2*dstW up to 0; the pointers are biased so
		   that (pointer + counter-derived offset) lands on element 0 */
		long counter= -2*dstW;
		filter-= counter*2;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movq "MANGLE(w02)", %%mm6	\n\t"
			"push %%"REG_BP"		\n\t" // we use 7 regs here ...
			"mov %%"REG_a", %%"REG_BP"	\n\t"
			ASMALIGN16
			"1:				\n\t"
			/* two output samples per iteration: load their source
			   positions, 4 taps each, multiply-accumulate, pack */
			"movzwl (%2, %%"REG_BP"), %%eax	\n\t"
			"movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
			"movq (%1, %%"REG_BP", 4), %%mm1\n\t"
			"movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
			"movd (%3, %%"REG_a"), %%mm0	\n\t"
			"movd (%3, %%"REG_b"), %%mm2	\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"
			"psrad $8, %%mm0		\n\t"
			"psrad $8, %%mm3		\n\t"
			"packssdw %%mm3, %%mm0		\n\t"
			"pmaddwd %%mm6, %%mm0		\n\t"
			"packssdw %%mm0, %%mm0		\n\t"
			"movd %%mm0, (%4, %%"REG_BP")	\n\t"
			"add $4, %%"REG_BP"		\n\t"
			" jnc 1b			\n\t"

			"pop %%"REG_BP"			\n\t"
			: "+a" (counter)
			: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
			: "%"REG_b
		);
	}
	else if(filterSize==8)
	{
		/* same scheme as the filterSize==4 path, unrolled to 8 taps */
		long counter= -2*dstW;
		filter-= counter*4;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movq "MANGLE(w02)", %%mm6	\n\t"
			"push %%"REG_BP"		\n\t" // we use 7 regs here ...
			"mov %%"REG_a", %%"REG_BP"	\n\t"
			ASMALIGN16
			"1:				\n\t"
			"movzwl (%2, %%"REG_BP"), %%eax	\n\t"
			"movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
			"movq (%1, %%"REG_BP", 8), %%mm1\n\t"
			"movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
			"movd (%3, %%"REG_a"), %%mm0	\n\t"
			"movd (%3, %%"REG_b"), %%mm2	\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"

			/* second group of 4 taps, accumulated into mm0/mm3 */
			"movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
			"movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
			"movd 4(%3, %%"REG_a"), %%mm4	\n\t"
			"movd 4(%3, %%"REG_b"), %%mm2	\n\t"
			"punpcklbw %%mm7, %%mm4		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm4		\n\t"
			"pmaddwd %%mm2, %%mm5		\n\t"
			"paddd %%mm4, %%mm0		\n\t"
			"paddd %%mm5, %%mm3		\n\t"

			"psrad $8, %%mm0		\n\t"
			"psrad $8, %%mm3		\n\t"
			"packssdw %%mm3, %%mm0		\n\t"
			"pmaddwd %%mm6, %%mm0		\n\t"
			"packssdw %%mm0, %%mm0		\n\t"
			"movd %%mm0, (%4, %%"REG_BP")	\n\t"
			"add $4, %%"REG_BP"		\n\t"
			" jnc 1b			\n\t"

			"pop %%"REG_BP"			\n\t"
			: "+a" (counter)
			: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
			: "%"REG_b
		);
	}
	else
	{
		/* generic MMX path: inner loop (label 2) walks the taps four at
		   a time until REG_c reaches "offset" (src+filterSize) */
		uint8_t *offset = src+filterSize;
		long counter= -2*dstW;
//		filter-= counter*filterSize/2;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"movq "MANGLE(w02)", %%mm6	\n\t"
			ASMALIGN16
			"1:				\n\t"
			"mov %2, %%"REG_c"		\n\t"
			"movzwl (%%"REG_c", %0), %%eax	\n\t"
			"movzwl 2(%%"REG_c", %0), %%ebx	\n\t"
			"mov %5, %%"REG_c"		\n\t"
			"pxor %%mm4, %%mm4		\n\t"
			"pxor %%mm5, %%mm5		\n\t"
			"2:				\n\t"
			"movq (%1), %%mm1		\n\t"
			"movq (%1, %6), %%mm3		\n\t"
			"movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
			"movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"
			"paddd %%mm3, %%mm5		\n\t"
			"paddd %%mm0, %%mm4		\n\t"
			"add $8, %1			\n\t"
			"add $4, %%"REG_c"		\n\t"
			"cmp %4, %%"REG_c"		\n\t"
			" jb 2b				\n\t"
			"add %6, %1			\n\t"
			"psrad $8, %%mm4		\n\t"
			"psrad $8, %%mm5		\n\t"
			"packssdw %%mm5, %%mm4		\n\t"
			"pmaddwd %%mm6, %%mm4		\n\t"
			"packssdw %%mm4, %%mm4		\n\t"
			"mov %3, %%"REG_a"		\n\t"
			"movd %%mm4, (%%"REG_a", %0)	\n\t"
			"add $4, %0			\n\t"
			" jnc 1b			\n\t"

			: "+r" (counter), "+r" (filter)
			: "m" (filterPos), "m" (dst), "m"(offset),
			  "m" (src), "r" (filterSize*2)
			: "%"REG_b, "%"REG_a, "%"REG_c
		);
	}
#else
#ifdef HAVE_ALTIVEC
	hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
	/* plain C reference implementation */
	int i;
	for(i=0; i<dstW; i++)
	{
		int j;
		int srcPos= filterPos[i];
		int val=0;
//		printf("filterPos: %d\n", filterPos[i]);
		for(j=0; j<filterSize; j++)
		{
//			printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
			val += ((int)src[srcPos + j])*filter[filterSize*i + j];
		}
//		filter += hFilterSize;
		dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//		dst[i] = val>>7;
	}
#endif
#endif
}
2225 // *** horizontal scale Y line to temp buffer
/* Horizontally scale one luma line into the 16-bit intermediate buffer dst
   (dstWidth samples, each scaled by 128 relative to the 8-bit input — see
   the C fallback's <<7).  Packed-YUV and RGB/BGR sources are first
   converted to 8-bit gray into formatConvBuffer.
   funnyYCode points to run-time generated MMX2 scaling code invoked via
   "call *%4"; mmx2Filter/mmx2FilterPos are its coefficient/position
   tables — TODO confirm their exact layout where they are generated. */
static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
				   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
				   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
				   int32_t *mmx2FilterPos)
{
    /* convert non-planar sources to a plain 8-bit luma line first */
    if(srcFormat==IMGFMT_YUY2)
    {
	RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
	RENAME(uyvyToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
	RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
	RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
	RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
	RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
	RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
	RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }

#ifdef HAVE_MMX
	// use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
	/* high quality path: full FIR filtering */
	RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"mov %0, %%"REG_c"		\n\t"
			"mov %1, %%"REG_D"		\n\t"
			"mov %2, %%"REG_d"		\n\t"
			"mov %3, %%"REG_b"		\n\t"
			"xor %%"REG_a", %%"REG_a"	\n\t" // i
			PREFETCH" (%%"REG_c")		\n\t"
			PREFETCH" 32(%%"REG_c")		\n\t"
			PREFETCH" 64(%%"REG_c")		\n\t"

#ifdef ARCH_X86_64

/* call one chunk of the generated code, then advance the src (REG_c) and
   dst (REG_D) pointers by the amounts it reports via mmx2FilterPos/REG_a */
#define FUNNY_Y_CODE \
			"movl (%%"REG_b"), %%esi	\n\t"\
			"call *%4			\n\t"\
			"movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
			"add %%"REG_S", %%"REG_c"	\n\t"\
			"add %%"REG_a", %%"REG_D"	\n\t"\
			"xor %%"REG_a", %%"REG_a"	\n\t"\

#else

#define FUNNY_Y_CODE \
			"movl (%%"REG_b"), %%esi	\n\t"\
			"call *%4			\n\t"\
			"addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
			"add %%"REG_a", %%"REG_D"	\n\t"\
			"xor %%"REG_a", %%"REG_a"	\n\t"\

#endif

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

			:: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
			"m" (funnyYCode)
			: "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
		);
		/* fix up trailing samples whose source position reaches the
		   last pixel; *128 matches the C path's <<7 scaling */
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
	}
	else
	{
#endif
	long xInc_shr16 = xInc >> 16;
	uint16_t xInc_mask = xInc & 0xffff;
	//NO MMX just normal asm ...
	/* 16.16 fixed-point DDA: the integer part advances via the carry of
	   the 16-bit fractional add ("addw"/"adc"); two samples per loop */
	asm volatile(
		"xor %%"REG_a", %%"REG_a"	\n\t" // i
		"xor %%"REG_b", %%"REG_b"	\n\t" // xx
		"xorl %%ecx, %%ecx		\n\t" // 2*xalpha
		ASMALIGN16
		"1:				\n\t"
		"movzbl (%0, %%"REG_b"), %%edi	\n\t" //src[xx]
		"movzbl 1(%0, %%"REG_b"), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"mov %1, %%"REG_D"		\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
		"adc %3, %%"REG_b"		\n\t" //xx+= xInc>>8 + carry

		"movzbl (%0, %%"REG_b"), %%edi	\n\t" //src[xx]
		"movzbl 1(%0, %%"REG_b"), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"mov %1, %%"REG_D"		\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
		"adc %3, %%"REG_b"		\n\t" //xx+= xInc>>8 + carry


		"add $2, %%"REG_a"		\n\t"
		"cmp %2, %%"REG_a"		\n\t"
		" jb 1b				\n\t"


		:: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
		: "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
		);
#ifdef HAVE_MMX2
	} //if MMX2 can't be used
#endif
#else
	/* plain C bilinear fallback, 16.16 fixed-point position xpos */
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
		xpos+=xInc;
	}
#endif
    }
}
2397
/* Horizontally scale one pair of chroma lines (U and V) into the 16-bit
   intermediate buffer: U goes to dst[0..], V to dst+2048 (4096 bytes — the
   asm below uses the byte offset).  Packed-YUV and RGB/BGR sources are
   first converted into formatConvBuffer / formatConvBuffer+2048.
   funnyUVCode points to run-time generated MMX2 scaling code invoked via
   "call *%4"; see hyscale for the same scheme on luma. */
inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
				   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
				   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
				   int32_t *mmx2FilterPos)
{
    /* convert non-planar sources to two plain 8-bit chroma lines first */
    if(srcFormat==IMGFMT_YUY2)
    {
	RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
	RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
	RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
	RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
	RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
	RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
	RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
	RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(isGray(srcFormat))
    {
	/* gray has no chroma to scale */
	return;
    }

#ifdef HAVE_MMX
	// use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
	/* high quality path: full FIR filtering, once per chroma plane */
	RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
	RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
			"mov %0, %%"REG_c"		\n\t"
			"mov %1, %%"REG_D"		\n\t"
			"mov %2, %%"REG_d"		\n\t"
			"mov %3, %%"REG_b"		\n\t"
			"xor %%"REG_a", %%"REG_a"	\n\t" // i
			PREFETCH" (%%"REG_c")		\n\t"
			PREFETCH" 32(%%"REG_c")		\n\t"
			PREFETCH" 64(%%"REG_c")		\n\t"

#ifdef ARCH_X86_64

/* call one chunk of the generated code, then advance the src (REG_c) and
   dst (REG_D) pointers — same macro scheme as FUNNY_Y_CODE in hyscale */
#define FUNNY_UV_CODE \
			"movl (%%"REG_b"), %%esi	\n\t"\
			"call *%4			\n\t"\
			"movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
			"add %%"REG_S", %%"REG_c"	\n\t"\
			"add %%"REG_a", %%"REG_D"	\n\t"\
			"xor %%"REG_a", %%"REG_a"	\n\t"\

#else

#define FUNNY_UV_CODE \
			"movl (%%"REG_b"), %%esi	\n\t"\
			"call *%4			\n\t"\
			"addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
			"add %%"REG_a", %%"REG_D"	\n\t"\
			"xor %%"REG_a", %%"REG_a"	\n\t"\

#endif

/* first plane (src1 -> dst) ... */
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
			/* ... then switch to the second plane (src2 -> dst+4096 bytes) */
			"xor %%"REG_a", %%"REG_a"	\n\t" // i
			"mov %5, %%"REG_c"		\n\t" // src
			"mov %1, %%"REG_D"		\n\t" // buf1
			"add $4096, %%"REG_D"		\n\t"
			PREFETCH" (%%"REG_c")		\n\t"
			PREFETCH" 32(%%"REG_c")		\n\t"
			PREFETCH" 64(%%"REG_c")		\n\t"

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

			:: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
			"m" (funnyUVCode), "m" (src2)
			: "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
		);
		/* fix up trailing samples whose source position reaches the
		   last pixel; *128 matches the C path's <<7 scaling */
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
		{
//			printf("%d %d %d\n", dstWidth, i, srcW);
			dst[i] = src1[srcW-1]*128;
			dst[i+2048] = src2[srcW-1]*128;
		}
	}
	else
	{
#endif
	long xInc_shr16 = (long) (xInc >> 16);
	uint16_t xInc_mask = xInc & 0xffff;
	/* 16.16 fixed-point DDA: integer part advances via the carry of the
	   16-bit fractional add; one U and one V sample per loop iteration */
	asm volatile(
		"xor %%"REG_a", %%"REG_a"	\n\t" // i
		"xor %%"REG_b", %%"REG_b"	\n\t" // xx
		"xorl %%ecx, %%ecx		\n\t" // 2*xalpha
		ASMALIGN16
		"1:				\n\t"
		"mov %0, %%"REG_S"		\n\t"
		"movzbl (%%"REG_S", %%"REG_b"), %%edi	\n\t" //src[xx]
		"movzbl 1(%%"REG_S", %%"REG_b"), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"mov %1, %%"REG_D"		\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"

		"movzbl (%5, %%"REG_b"), %%edi	\n\t" //src[xx]
		"movzbl 1(%5, %%"REG_b"), %%esi	\n\t" //src[xx+1]
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"mov %1, %%"REG_D"		\n\t"
		"shrl $9, %%esi			\n\t"
		"movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"

		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
		"adc %3, %%"REG_b"		\n\t" //xx+= xInc>>8 + carry
		"add $1, %%"REG_a"		\n\t"
		"cmp %2, %%"REG_a"		\n\t"
		" jb 1b				\n\t"

/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC-4.0 */
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
		:: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
		:: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
		"r" (src2)
		: "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
		);
#ifdef HAVE_MMX2
	} //if MMX2 can't be used
#endif
#else
	/* plain C bilinear fallback, 16.16 fixed-point position xpos */
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
		dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
	dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
	dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
		xpos+=xInc;
	}
#endif
    }
}
2603
2604 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2605 int srcSliceH, uint8_t* dst[], int dstStride[]){
2606
2607 /* load a few things into local vars to make the code more readable? and faster */
2608 const int srcW= c->srcW;
2609 const int dstW= c->dstW;
2610 const int dstH= c->dstH;
2611 const int chrDstW= c->chrDstW;
2612 const int chrSrcW= c->chrSrcW;
2613 const int lumXInc= c->lumXInc;
2614 const int chrXInc= c->chrXInc;
2615 const int dstFormat= c->dstFormat;
2616 const int srcFormat= c->srcFormat;
2617 const int flags= c->flags;
2618 const int canMMX2BeUsed= c->canMMX2BeUsed;
2619 int16_t *vLumFilterPos= c->vLumFilterPos;
2620 int16_t *vChrFilterPos= c->vChrFilterPos;
2621 int16_t *hLumFilterPos= c->hLumFilterPos;
2622 int16_t *hChrFilterPos= c->hChrFilterPos;
2623 int16_t *vLumFilter= c->vLumFilter;
2624 int16_t *vChrFilter= c->vChrFilter;
2625 int16_t *hLumFilter= c->hLumFilter;
2626 int16_t *hChrFilter= c->hChrFilter;
2627 int32_t *lumMmxFilter= c->lumMmxFilter;
2628 int32_t *chrMmxFilter= c->chrMmxFilter;
2629 const int vLumFilterSize= c->vLumFilterSize;
2630 const int vChrFilterSize= c->vChrFilterSize;
2631 const int hLumFilterSize= c->hLumFilterSize;
2632 const int hChrFilterSize= c->hChrFilterSize;
2633 int16_t **lumPixBuf= c->lumPixBuf;
2634 int16_t **chrPixBuf= c->chrPixBuf;
2635 const int vLumBufSize= c->vLumBufSize;
2636 const int vChrBufSize= c->vChrBufSize;
2637 uint8_t *funnyYCode= c->funnyYCode;
2638 uint8_t *funnyUVCode= c->funnyUVCode;
2639 uint8_t *formatConvBuffer= c->formatConvBuffer;
2640 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2641 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2642 int lastDstY;
2643
2644 /* vars whch will change and which we need to storw back in the context */
2645 int dstY= c->dstY;
2646 int lumBufIndex= c->lumBufIndex;
2647 int chrBufIndex= c->chrBufIndex;
2648 int lastInLumBuf= c->lastInLumBuf;
2649 int lastInChrBuf= c->lastInChrBuf;
2650
2651 if(isPacked(c->srcFormat)){
2652 src[0]=
2653 src[1]=
2654 src[2]= src[0];
2655 srcStride[0]=
2656 srcStride[1]=
2657 srcStride[2]= srcStride[0];
2658 }
2659 srcStride[1]<<= c->vChrDrop;
2660 srcStride[2]<<= c->vChrDrop;
2661
2662 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2663 // (int)dst[0], (int)dst[1], (int)dst[2]);
2664
2665 #if 0 //self test FIXME move to a vfilter or something
2666 {
2667 static volatile int i=0;
2668 i++;
2669 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2670 selfTest(src, srcStride, c->srcW, c->srcH);
2671 i--;
2672 }
2673 #endif
2674
2675 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2676 //dstStride[0],dstStride[1],dstStride[2]);
2677
2678 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2679 {
2680 static int firstTime=1; //FIXME move this into the context perhaps
2681 if(flags & SWS_PRINT_INFO && firstTime)
2682 {
2683 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2684 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2685 firstTime=0;
2686 }
2687 }
2688
2689 /* Note the user might start scaling the picture in the middle so this will not get executed
2690 this is not really intended but works currently, so ppl might do it */
2691 if(srcSliceY ==0){
2692 lumBufIndex=0;
2693 chrBufIndex=0;
2694 dstY=0;
2695 lastInLumBuf= -1;
2696 lastInChrBuf= -1;
2697 }
2698
2699 lastDstY= dstY;
2700
2701 for(;dstY < dstH; dstY++){
2702 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2703 const int chrDstY= dstY>>c->chrDstVSubSample;
2704 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2705 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2706
2707 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2708 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2709 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2710 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2711
2712 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2713 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2714 //handle holes (FAST_BILINEAR & weird filters)
2715 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2716 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2717 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2718 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2719 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2720
2721 // Do we have enough lines in this slice to output the dstY line
2722 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2723 {
2724 //Do horizontal scaling
2725 while(lastInLumBuf < lastLumSrcY)
2726 {
2727 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2728 lumBufIndex++;
2729 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2730 ASSERT(lumBufIndex < 2*vLumBufSize)
2731 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2732 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2733 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2734 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2735 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2736 funnyYCode, c->srcFormat, formatConvBuffer,
2737 c->lumMmx2Filter, c->lumMmx2FilterPos);
2738 lastInLumBuf++;
2739 }
2740 while(lastInChrBuf < lastChrSrcY)
2741 {
2742 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2743 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2744 chrBufIndex++;
2745 ASSERT(chrBufIndex < 2*vChrBufSize)
2746 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2747 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2748 //FIXME replace parameters through context struct (some at least)
2749
2750 if(!(isGray(srcFormat) || isGray(dstFormat)))
2751 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2752 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2753 funnyUVCode, c->srcFormat, formatConvBuffer,
2754 c->chrMmx2Filter, c->chrMmx2FilterPos);
2755 lastInChrBuf++;
2756 }
2757 //wrap buf index around to stay inside the ring buffer
2758 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2759 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2760 }
2761 else // not enough lines left in this slice -> load the rest in the buffer
2762 {
2763 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2764 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2765 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2766 vChrBufSize, vLumBufSize);*/
2767
2768 //Do horizontal scaling
2769 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2770 {
2771 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2772 lumBufIndex++;
2773 ASSERT(lumBufIndex < 2*vLumBufSize)
2774 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2775 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2776 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2777 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2778 funnyYCode, c->srcFormat, formatConvBuffer,
2779 c->lumMmx2Filter, c->lumMmx2FilterPos);
2780 lastInLumBuf++;
2781 }
2782 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2783 {
2784 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2785 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2786 chrBufIndex++;
2787 ASSERT(chrBufIndex < 2*vChrBufSize)
2788 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2789 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2790
2791 if(!(isGray(srcFormat) || isGray(dstFormat)))
2792 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2793 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2794 funnyUVCode, c->srcFormat, formatConvBuffer,
2795 c->chrMmx2Filter, c->chrMmx2FilterPos);
2796 lastInChrBuf++;
2797 }
2798 //wrap buf index around to stay inside the ring buffer
2799 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2800 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2801 break; //we can't output a dstY line so let's try with the next slice
2802 }
2803
2804 #ifdef HAVE_MMX
2805 b5Dither= dither8[dstY&1];
2806 g6Dither= dither4[dstY&1];
2807 g5Dither= dither8[dstY&1];
2808 r5Dither= dither8[(dstY+1)&1];
2809 #endif
2810 if(dstY < dstH-2)
2811 {
2812 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2813 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2814 #ifdef HAVE_MMX
2815 int i;
2816 for(i=0; i<vLumFilterSize; i++)
2817 {
2818 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2819 lumMmxFilter[4*i+2]=
2820 lumMmxFilter[4*i+3]=
2821 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2822 }
2823 for(i=0; i<vChrFilterSize; i++)
2824 {
2825 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2826 chrMmxFilter[4*i+2]=
2827 chrMmxFilter[4*i+3]=
2828 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2829 }
2830 #endif
2831 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2832 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2833 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2834 RENAME(yuv2nv12X)(c,
2835 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2836 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2837 dest, uDest, dstW, chrDstW, dstFormat);
2838 }
2839 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2840 {
2841 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2842 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2843 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2844 {
2845 int16_t *lumBuf = lumPixBuf[0];
2846 int16_t *chrBuf= chrPixBuf[0];
2847 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2848 }
2849 else //General YV12
2850 {
2851 RENAME(yuv2yuvX)(c,
2852 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2853 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2854 dest, uDest, vDest, dstW, chrDstW);
2855 }
2856 }
2857 else
2858 {
2859 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2860 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2861 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2862 {
2863 int chrAlpha= vChrFilter[2*dstY+1];
2864 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2865 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2866 }
2867 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2868 {
2869 int lumAlpha= vLumFilter[2*dstY+1];
2870 int chrAlpha= vChrFilter[2*dstY+1];
2871 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2872 dest, dstW, lumAlpha, chrAlpha, dstY);
2873 }
2874 else //General RGB
2875 {
2876 RENAME(yuv2packedX)(c,
2877 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2878 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2879 dest, dstW, dstY);
2880 }
2881 }
2882 }
2883 else // hmm looks like we can't use MMX here without overwriting this array's tail
2884 {
2885 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2886 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2887 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2888 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2889 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2890 yuv2nv12XinC(
2891 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2892 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2893 dest, uDest, dstW, chrDstW, dstFormat);
2894 }
2895 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2896 {
2897 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2898 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2899 yuv2yuvXinC(
2900 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2901 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2902 dest, uDest, vDest, dstW, chrDstW);
2903 }
2904 else
2905 {
2906 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2907 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2908 yuv2packedXinC(c,
2909 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2910 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2911 dest, dstW, dstY);
2912 }
2913 }
2914 }
2915
2916 #ifdef HAVE_MMX
2917 __asm __volatile(SFENCE:::"memory");
2918 __asm __volatile(EMMS:::"memory");
2919 #endif
2920 /* store changed local vars back in the context */
2921 c->dstY= dstY;
2922 c->lumBufIndex= lumBufIndex;
2923 c->chrBufIndex= chrBufIndex;
2924 c->lastInLumBuf= lastInLumBuf;
2925 c->lastInChrBuf= lastInChrBuf;
2926
2927 return dstY - lastDstY;
2928 }