Mercurial > mplayer.hg
comparison view: libswscale/swscale_template.c @ changeset 18861:8579acff875e
commit message: "Move postproc ---> libswscale"
author:   lucabe
date:     Fri, 30 Jun 2006 12:00:31 +0000
parents:  (none)
children: bae6c99a99cc
compared against parent changeset 18860:ef741a3e90f5
1 /* | |
2 Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> | |
3 | |
4 This program is free software; you can redistribute it and/or modify | |
5 it under the terms of the GNU General Public License as published by | |
6 the Free Software Foundation; either version 2 of the License, or | |
7 (at your option) any later version. | |
8 | |
9 This program is distributed in the hope that it will be useful, | |
10 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 GNU General Public License for more details. | |
13 | |
14 You should have received a copy of the GNU General Public License | |
15 along with this program; if not, write to the Free Software | |
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | |
17 */ | |
18 | |
19 #include "asmalign.h" | |
20 | |
21 #undef REAL_MOVNTQ | |
22 #undef MOVNTQ | |
23 #undef PAVGB | |
24 #undef PREFETCH | |
25 #undef PREFETCHW | |
26 #undef EMMS | |
27 #undef SFENCE | |
28 | |
29 #ifdef HAVE_3DNOW | |
30 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */ | |
31 #define EMMS "femms" | |
32 #else | |
33 #define EMMS "emms" | |
34 #endif | |
35 | |
36 #ifdef HAVE_3DNOW | |
37 #define PREFETCH "prefetch" | |
38 #define PREFETCHW "prefetchw" | |
39 #elif defined ( HAVE_MMX2 ) | |
40 #define PREFETCH "prefetchnta" | |
41 #define PREFETCHW "prefetcht0" | |
42 #else | |
43 #define PREFETCH "/nop" | |
44 #define PREFETCHW "/nop" | |
45 #endif | |
46 | |
47 #ifdef HAVE_MMX2 | |
48 #define SFENCE "sfence" | |
49 #else | |
50 #define SFENCE "/nop" | |
51 #endif | |
52 | |
53 #ifdef HAVE_MMX2 | |
54 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | |
55 #elif defined (HAVE_3DNOW) | |
56 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" | |
57 #endif | |
58 | |
59 #ifdef HAVE_MMX2 | |
60 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" | |
61 #else | |
62 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" | |
63 #endif | |
64 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) | |
65 | |
66 #ifdef HAVE_ALTIVEC | |
67 #include "swscale_altivec_template.c" | |
68 #endif | |
69 | |
/* Vertical scale: accumulate the filter taps for one plane into mm3/mm4,
   then shift, pack and store 8 output bytes per iteration.
   x = byte offset into the source rows, offset = filter table offset in
   the context pointed to by %0; %1 = dest, %2 = width. */
#define YSCALEYUV2YV12X(x, offset) \
	"xor %%"REG_a", %%"REG_a"	\n\t"\
	"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
	"movq %%mm3, %%mm4		\n\t"\
	"lea " offset "(%0), %%"REG_d"	\n\t"\
	"mov (%%"REG_d"), %%"REG_S"	\n\t"\
	ASMALIGN16 /* FIXME Unroll? */\
	"1:				\n\t"\
	"movq 8(%%"REG_d"), %%mm0	\n\t" /* filterCoeff */\
	"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
	"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
	"add $16, %%"REG_d"		\n\t"\
	"mov (%%"REG_d"), %%"REG_S"	\n\t"\
	"test %%"REG_S", %%"REG_S"	\n\t"\
	"pmulhw %%mm0, %%mm2		\n\t"\
	"pmulhw %%mm0, %%mm5		\n\t"\
	"paddw %%mm2, %%mm3		\n\t"\
	"paddw %%mm5, %%mm4		\n\t"\
	" jnz 1b			\n\t"\
	"psraw $3, %%mm3		\n\t"\
	"psraw $3, %%mm4		\n\t"\
	"packuswb %%mm4, %%mm3		\n\t"\
	MOVNTQ(%%mm3, (%1, %%REGa))\
	"add $8, %%"REG_a"		\n\t"\
	"cmp %2, %%"REG_a"		\n\t"\
	"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
	"movq %%mm3, %%mm4		\n\t"\
	"lea " offset "(%0), %%"REG_d"	\n\t"\
	"mov (%%"REG_d"), %%"REG_S"	\n\t"\
	"jb 1b				\n\t"
100 | |
/* 1-tap (unscaled) vertical pass: shift 16-bit intermediates down to bytes
   and store; %0 = src, %1 = dest, %2 = negative byte count. */
#define YSCALEYUV2YV121 \
	"mov %2, %%"REG_a"		\n\t"\
	ASMALIGN16 /* FIXME Unroll? */\
	"1:				\n\t"\
	"movq (%0, %%"REG_a", 2), %%mm0	\n\t"\
	"movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
	"psraw $7, %%mm0		\n\t"\
	"psraw $7, %%mm1		\n\t"\
	"packuswb %%mm1, %%mm0		\n\t"\
	MOVNTQ(%%mm0, (%1, %%REGa))\
	"add $8, %%"REG_a"		\n\t"\
	"jnc 1b				\n\t"
113 | |
114 /* | |
115 :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
116 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
117 "r" (dest), "m" (dstW), | |
118 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
119 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
120 */ | |
/* Vertical scale for packed output: first loop accumulates the chroma taps
   (U in mm3, V in mm4), second loop the luma taps (Y1 in mm1, Y2 in mm7).
   %0 = context (filter tables at CHR/LUM_MMX_FILTER_OFFSET). */
#define YSCALEYUV2PACKEDX \
	"xor %%"REG_a", %%"REG_a"	\n\t"\
	ASMALIGN16\
	"nop				\n\t"\
	"1:				\n\t"\
	"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
	"mov (%%"REG_d"), %%"REG_S"	\n\t"\
	"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
	"movq %%mm3, %%mm4		\n\t"\
	ASMALIGN16\
	"2:				\n\t"\
	"movq 8(%%"REG_d"), %%mm0	\n\t" /* filterCoeff */\
	"movq (%%"REG_S", %%"REG_a"), %%mm2	\n\t" /* UsrcData */\
	"movq 4096(%%"REG_S", %%"REG_a"), %%mm5	\n\t" /* VsrcData */\
	"add $16, %%"REG_d"		\n\t"\
	"mov (%%"REG_d"), %%"REG_S"	\n\t"\
	"pmulhw %%mm0, %%mm2		\n\t"\
	"pmulhw %%mm0, %%mm5		\n\t"\
	"paddw %%mm2, %%mm3		\n\t"\
	"paddw %%mm5, %%mm4		\n\t"\
	"test %%"REG_S", %%"REG_S"	\n\t"\
	" jnz 2b			\n\t"\
\
	"lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
	"mov (%%"REG_d"), %%"REG_S"	\n\t"\
	"movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
	"movq %%mm1, %%mm7		\n\t"\
	ASMALIGN16\
	"2:				\n\t"\
	"movq 8(%%"REG_d"), %%mm0	\n\t" /* filterCoeff */\
	"movq (%%"REG_S", %%"REG_a", 2), %%mm2	\n\t" /* Y1srcData */\
	"movq 8(%%"REG_S", %%"REG_a", 2), %%mm5	\n\t" /* Y2srcData */\
	"add $16, %%"REG_d"		\n\t"\
	"mov (%%"REG_d"), %%"REG_S"	\n\t"\
	"pmulhw %%mm0, %%mm2		\n\t"\
	"pmulhw %%mm0, %%mm5		\n\t"\
	"paddw %%mm2, %%mm1		\n\t"\
	"paddw %%mm5, %%mm7		\n\t"\
	"test %%"REG_S", %%"REG_S"	\n\t"\
	" jnz 2b			\n\t"
161 | |
162 | |
/* YUV -> RGB conversion on top of YSCALEYUV2PACKEDX: applies the offset and
   coefficient tables from the context and leaves packed bytes in
   mm2/mm0 (B), mm4/mm3 (G), mm5/mm6 (R) with mm7 zeroed. */
#define YSCALEYUV2RGBX \
	YSCALEYUV2PACKEDX\
	"psubw "U_OFFSET"(%0), %%mm3	\n\t" /* (U-128)8*/\
	"psubw "V_OFFSET"(%0), %%mm4	\n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
	"pmulhw "UG_COEFF"(%0), %%mm3	\n\t"\
	"pmulhw "VG_COEFF"(%0), %%mm4	\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"pmulhw "UB_COEFF"(%0), %%mm2	\n\t"\
	"pmulhw "VR_COEFF"(%0), %%mm5	\n\t"\
	"psubw "Y_OFFSET"(%0), %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "Y_OFFSET"(%0), %%mm7	\n\t" /* 8(Y-16)*/\
	"pmulhw "Y_COEFF"(%0), %%mm1	\n\t"\
	"pmulhw "Y_COEFF"(%0), %%mm7	\n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4		\n\t"\
	"movq %%mm2, %%mm0		\n\t"\
	"movq %%mm5, %%mm6		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
	"punpcklwd %%mm2, %%mm2		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm4, %%mm4		\n\t"\
	"paddw %%mm1, %%mm2		\n\t"\
	"paddw %%mm1, %%mm5		\n\t"\
	"paddw %%mm1, %%mm4		\n\t"\
	"punpckhwd %%mm0, %%mm0		\n\t"\
	"punpckhwd %%mm6, %%mm6		\n\t"\
	"punpckhwd %%mm3, %%mm3		\n\t"\
	"paddw %%mm7, %%mm0		\n\t"\
	"paddw %%mm7, %%mm6		\n\t"\
	"paddw %%mm7, %%mm3		\n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2		\n\t"\
	"packuswb %%mm6, %%mm5		\n\t"\
	"packuswb %%mm3, %%mm4		\n\t"\
	"pxor %%mm7, %%mm7		\n\t"
#if 0
/* Disabled: older full-vertical-interpolation YUV->RGB path, kept for
   reference only (excluded from compilation by the surrounding #if 0). */
#define FULL_YSCALEYUV2RGB \
	"pxor %%mm7, %%mm7		\n\t"\
	"movd %6, %%mm6			\n\t" /*yalpha1*/\
	"punpcklwd %%mm6, %%mm6		\n\t"\
	"punpcklwd %%mm6, %%mm6		\n\t"\
	"movd %7, %%mm5			\n\t" /*uvalpha1*/\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"xor %%"REG_a", %%"REG_a"	\n\t"\
	ASMALIGN16\
	"1:				\n\t"\
	"movq (%0, %%"REG_a", 2), %%mm0	\n\t" /*buf0[eax]*/\
	"movq (%1, %%"REG_a", 2), %%mm1	\n\t" /*buf1[eax]*/\
	"movq (%2, %%"REG_a",2), %%mm2	\n\t" /* uvbuf0[eax]*/\
	"movq (%3, %%"REG_a",2), %%mm3	\n\t" /* uvbuf1[eax]*/\
	"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
	"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
	"pmulhw %%mm6, %%mm0		\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"pmulhw %%mm5, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
	"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"movq 4096(%2, %%"REG_a",2), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
	"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
	"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
	"movq 4096(%3, %%"REG_a",2), %%mm0	\n\t" /* uvbuf1[eax+2048]*/\
	"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
	"psubw %%mm0, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
	"psubw "MANGLE(w80)", %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "MANGLE(w400)", %%mm3	\n\t" /* 8(U-128)*/\
	"pmulhw "MANGLE(yCoeff)", %%mm1	\n\t"\
\
\
	"pmulhw %%mm5, %%mm4		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
	"psraw $4, %%mm0		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
	"pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
	"paddw %%mm4, %%mm0		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
	"psubw "MANGLE(w400)", %%mm0	\n\t" /* (V-128)8*/\
\
\
	"movq %%mm0, %%mm4		\n\t" /* (V-128)8*/\
	"pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
	"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
	"paddw %%mm1, %%mm3		\n\t" /* B*/\
	"paddw %%mm1, %%mm0		\n\t" /* R*/\
	"packuswb %%mm3, %%mm3		\n\t"\
\
	"packuswb %%mm0, %%mm0		\n\t"\
	"paddw %%mm4, %%mm2		\n\t"\
	"paddw %%mm2, %%mm1		\n\t" /* G*/\
\
	"packuswb %%mm1, %%mm1		\n\t"
#endif
254 | |
/* Two-row vertical interpolation for packed (non-RGB) output: blends buf0/buf1
   and uvbuf0/uvbuf1 with the alpha values stored in the context ("#c").
   Leaves U in mm3, V in mm4, Y1 in mm1, Y2 in mm7. */
#define REAL_YSCALEYUV2PACKED(index, c) \
	"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
	"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
	"psraw $3, %%mm0		\n\t"\
	"psraw $3, %%mm1		\n\t"\
	"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
	"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
	"xor "#index", "#index"		\n\t"\
	ASMALIGN16\
	"1:				\n\t"\
	"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
	"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
	"movq 4096(%2, "#index"), %%mm5	\n\t" /* uvbuf0[eax+2048]*/\
	"movq 4096(%3, "#index"), %%mm4	\n\t" /* uvbuf1[eax+2048]*/\
	"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
	"psubw %%mm4, %%mm5		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
	"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
	"pmulhw %%mm0, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
	"pmulhw %%mm0, %%mm5		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
	"psraw $7, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
	"psraw $7, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
	"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
	"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
	"movq (%0, "#index", 2), %%mm0	\n\t" /*buf0[eax]*/\
	"movq (%1, "#index", 2), %%mm1	\n\t" /*buf1[eax]*/\
	"movq 8(%0, "#index", 2), %%mm6	\n\t" /*buf0[eax]*/\
	"movq 8(%1, "#index", 2), %%mm7	\n\t" /*buf1[eax]*/\
	"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
	"psubw %%mm7, %%mm6		\n\t" /* buf0[eax] - buf1[eax]*/\
	"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"psraw $7, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"psraw $7, %%mm7		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
	"paddw %%mm6, %%mm7		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
292 | |
/* Two-row vertical interpolation followed by YUV->RGB: packed bytes end up
   in mm2/mm0 (B), mm4/mm3 (G), mm5/mm6 (R), mm7 zeroed. */
#define REAL_YSCALEYUV2RGB(index, c) \
	"xor "#index", "#index"		\n\t"\
	ASMALIGN16\
	"1:				\n\t"\
	"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
	"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
	"movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
	"movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
	"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
	"psubw %%mm4, %%mm5		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
	"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
	"pmulhw %%mm0, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
	"pmulhw %%mm0, %%mm5		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
	"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
	"psraw $4, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
	"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
	"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
	"psubw "U_OFFSET"("#c"), %%mm3	\n\t" /* (U-128)8*/\
	"psubw "V_OFFSET"("#c"), %%mm4	\n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
	"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
	"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"movq (%0, "#index", 2), %%mm0	\n\t" /*buf0[eax]*/\
	"movq (%1, "#index", 2), %%mm1	\n\t" /*buf1[eax]*/\
	"movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
	"movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
	"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
	"psubw %%mm7, %%mm6		\n\t" /* buf0[eax] - buf1[eax]*/\
	"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
	"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"psraw $4, %%mm7		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
	"paddw %%mm6, %%mm7		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
	"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
	"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
	"psubw "Y_OFFSET"("#c"), %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "Y_OFFSET"("#c"), %%mm7	\n\t" /* 8(Y-16)*/\
	"pmulhw "Y_COEFF"("#c"), %%mm1	\n\t"\
	"pmulhw "Y_COEFF"("#c"), %%mm7	\n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4		\n\t"\
	"movq %%mm2, %%mm0		\n\t"\
	"movq %%mm5, %%mm6		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
	"punpcklwd %%mm2, %%mm2		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm4, %%mm4		\n\t"\
	"paddw %%mm1, %%mm2		\n\t"\
	"paddw %%mm1, %%mm5		\n\t"\
	"paddw %%mm1, %%mm4		\n\t"\
	"punpckhwd %%mm0, %%mm0		\n\t"\
	"punpckhwd %%mm6, %%mm6		\n\t"\
	"punpckhwd %%mm3, %%mm3		\n\t"\
	"paddw %%mm7, %%mm0		\n\t"\
	"paddw %%mm7, %%mm6		\n\t"\
	"paddw %%mm7, %%mm3		\n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2		\n\t"\
	"packuswb %%mm6, %%mm5		\n\t"\
	"packuswb %%mm3, %%mm4		\n\t"\
	"pxor %%mm7, %%mm7		\n\t"
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
358 | |
/* Single-source-row variant (no vertical blend) for packed output:
   U -> mm3, V -> mm4, Y1 -> mm1, Y2 -> mm7. */
#define REAL_YSCALEYUV2PACKED1(index, c) \
	"xor "#index", "#index"		\n\t"\
	ASMALIGN16\
	"1:				\n\t"\
	"movq (%2, "#index"), %%mm3	\n\t" /* uvbuf0[eax]*/\
	"movq 4096(%2, "#index"), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
	"psraw $7, %%mm3		\n\t" \
	"psraw $7, %%mm4		\n\t" \
	"movq (%0, "#index", 2), %%mm1	\n\t" /*buf0[eax]*/\
	"movq 8(%0, "#index", 2), %%mm7	\n\t" /*buf0[eax]*/\
	"psraw $7, %%mm1		\n\t" \
	"psraw $7, %%mm7		\n\t"

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
373 | |
/* Single-source-row YUV->RGB (no vertical blend): same register contract as
   REAL_YSCALEYUV2RGB — B in mm2/mm0, G in mm4/mm3, R in mm5/mm6, mm7 = 0. */
#define REAL_YSCALEYUV2RGB1(index, c) \
	"xor "#index", "#index"		\n\t"\
	ASMALIGN16\
	"1:				\n\t"\
	"movq (%2, "#index"), %%mm3	\n\t" /* uvbuf0[eax]*/\
	"movq 4096(%2, "#index"), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
	"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
	"psraw $4, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
	"psubw "U_OFFSET"("#c"), %%mm3	\n\t" /* (U-128)8*/\
	"psubw "V_OFFSET"("#c"), %%mm4	\n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
	"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
	"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"movq (%0, "#index", 2), %%mm1	\n\t" /*buf0[eax]*/\
	"movq 8(%0, "#index", 2), %%mm7	\n\t" /*buf0[eax]*/\
	"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"psraw $4, %%mm7		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
	"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
	"psubw "Y_OFFSET"("#c"), %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "Y_OFFSET"("#c"), %%mm7	\n\t" /* 8(Y-16)*/\
	"pmulhw "Y_COEFF"("#c"), %%mm1	\n\t"\
	"pmulhw "Y_COEFF"("#c"), %%mm7	\n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4		\n\t"\
	"movq %%mm2, %%mm0		\n\t"\
	"movq %%mm5, %%mm6		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
	"punpcklwd %%mm2, %%mm2		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm4, %%mm4		\n\t"\
	"paddw %%mm1, %%mm2		\n\t"\
	"paddw %%mm1, %%mm5		\n\t"\
	"paddw %%mm1, %%mm4		\n\t"\
	"punpckhwd %%mm0, %%mm0		\n\t"\
	"punpckhwd %%mm6, %%mm6		\n\t"\
	"punpckhwd %%mm3, %%mm3		\n\t"\
	"paddw %%mm7, %%mm0		\n\t"\
	"paddw %%mm7, %%mm6		\n\t"\
	"paddw %%mm7, %%mm3		\n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2		\n\t"\
	"packuswb %%mm6, %%mm5		\n\t"\
	"packuswb %%mm3, %%mm4		\n\t"\
	"pxor %%mm7, %%mm7		\n\t"
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
422 | |
/* Packed-output variant that averages the two chroma rows (uvbuf0+uvbuf1)
   instead of alpha-blending them; luma comes from buf0 only. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
	"xor "#index", "#index"		\n\t"\
	ASMALIGN16\
	"1:				\n\t"\
	"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
	"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
	"movq 4096(%2, "#index"), %%mm5	\n\t" /* uvbuf0[eax+2048]*/\
	"movq 4096(%3, "#index"), %%mm4	\n\t" /* uvbuf1[eax+2048]*/\
	"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
	"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
	"psrlw $8, %%mm3		\n\t" \
	"psrlw $8, %%mm4		\n\t" \
	"movq (%0, "#index", 2), %%mm1	\n\t" /*buf0[eax]*/\
	"movq 8(%0, "#index", 2), %%mm7	\n\t" /*buf0[eax]*/\
	"psraw $7, %%mm1		\n\t" \
	"psraw $7, %%mm7		\n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
440 | |
// do vertical chrominance interpolation
/* YUV->RGB with averaged (not blended) chroma rows; same register contract
   as REAL_YSCALEYUV2RGB. */
#define REAL_YSCALEYUV2RGB1b(index, c) \
	"xor "#index", "#index"		\n\t"\
	ASMALIGN16\
	"1:				\n\t"\
	"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
	"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
	"movq 4096(%2, "#index"), %%mm5	\n\t" /* uvbuf0[eax+2048]*/\
	"movq 4096(%3, "#index"), %%mm4	\n\t" /* uvbuf1[eax+2048]*/\
	"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
	"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
	"psrlw $5, %%mm3		\n\t" /*FIXME might overflow*/\
	"psrlw $5, %%mm4		\n\t" /*FIXME might overflow*/\
	"psubw "U_OFFSET"("#c"), %%mm3	\n\t" /* (U-128)8*/\
	"psubw "V_OFFSET"("#c"), %%mm4	\n\t" /* (V-128)8*/\
	"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
	"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
	"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
	"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
	"movq (%0, "#index", 2), %%mm1	\n\t" /*buf0[eax]*/\
	"movq 8(%0, "#index", 2), %%mm7	\n\t" /*buf0[eax]*/\
	"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"psraw $4, %%mm7		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
	"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
	"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
	"psubw "Y_OFFSET"("#c"), %%mm1	\n\t" /* 8(Y-16)*/\
	"psubw "Y_OFFSET"("#c"), %%mm7	\n\t" /* 8(Y-16)*/\
	"pmulhw "Y_COEFF"("#c"), %%mm1	\n\t"\
	"pmulhw "Y_COEFF"("#c"), %%mm7	\n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4		\n\t"\
	"movq %%mm2, %%mm0		\n\t"\
	"movq %%mm5, %%mm6		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
	"punpcklwd %%mm2, %%mm2		\n\t"\
	"punpcklwd %%mm5, %%mm5		\n\t"\
	"punpcklwd %%mm4, %%mm4		\n\t"\
	"paddw %%mm1, %%mm2		\n\t"\
	"paddw %%mm1, %%mm5		\n\t"\
	"paddw %%mm1, %%mm4		\n\t"\
	"punpckhwd %%mm0, %%mm0		\n\t"\
	"punpckhwd %%mm6, %%mm6		\n\t"\
	"punpckhwd %%mm3, %%mm3		\n\t"\
	"paddw %%mm7, %%mm0		\n\t"\
	"paddw %%mm7, %%mm6		\n\t"\
	"paddw %%mm7, %%mm3		\n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2		\n\t"\
	"packuswb %%mm6, %%mm5		\n\t"\
	"packuswb %%mm3, %%mm4		\n\t"\
	"pxor %%mm7, %%mm7		\n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
494 | |
/* Interleave B/G/R byte registers into 32-bit 0RGB pixels and store
   4 movntq's (32 bytes = 8 pixels) per loop iteration. */
#define REAL_WRITEBGR32(dst, dstw, index) \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1		\n\t" /* B */\
	"movq %%mm5, %%mm6		\n\t" /* R */\
	"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
	MOVNTQ(%%mm0, (dst, index, 4))\
	MOVNTQ(%%mm2, 8(dst, index, 4))\
	MOVNTQ(%%mm1, 16(dst, index, 4))\
	MOVNTQ(%%mm3, 24(dst, index, 4))\
\
	"add $8, "#index"		\n\t"\
	"cmp "#dstw", "#index"		\n\t"\
	" jb 1b				\n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
519 | |
/* Pack B/G/R bytes down to RGB565 (5-6-5) and store 16 bytes per iteration. */
#define REAL_WRITEBGR16(dst, dstw, index) \
	"pand "MANGLE(bF8)", %%mm2	\n\t" /* B */\
	"pand "MANGLE(bFC)", %%mm4	\n\t" /* G */\
	"pand "MANGLE(bF8)", %%mm5	\n\t" /* R */\
	"psrlq $3, %%mm2		\n\t"\
\
	"movq %%mm2, %%mm1		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
\
	"punpcklbw %%mm7, %%mm3		\n\t"\
	"punpcklbw %%mm5, %%mm2		\n\t"\
	"punpckhbw %%mm7, %%mm4		\n\t"\
	"punpckhbw %%mm5, %%mm1		\n\t"\
\
	"psllq $3, %%mm3		\n\t"\
	"psllq $3, %%mm4		\n\t"\
\
	"por %%mm3, %%mm2		\n\t"\
	"por %%mm4, %%mm1		\n\t"\
\
	MOVNTQ(%%mm2, (dst, index, 2))\
	MOVNTQ(%%mm1, 8(dst, index, 2))\
\
	"add $8, "#index"		\n\t"\
	"cmp "#dstw", "#index"		\n\t"\
	" jb 1b				\n\t"
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
547 | |
/* Pack B/G/R bytes down to RGB555 (5-5-5) and store 16 bytes per iteration. */
#define REAL_WRITEBGR15(dst, dstw, index) \
	"pand "MANGLE(bF8)", %%mm2	\n\t" /* B */\
	"pand "MANGLE(bF8)", %%mm4	\n\t" /* G */\
	"pand "MANGLE(bF8)", %%mm5	\n\t" /* R */\
	"psrlq $3, %%mm2		\n\t"\
	"psrlq $1, %%mm5		\n\t"\
\
	"movq %%mm2, %%mm1		\n\t"\
	"movq %%mm4, %%mm3		\n\t"\
\
	"punpcklbw %%mm7, %%mm3		\n\t"\
	"punpcklbw %%mm5, %%mm2		\n\t"\
	"punpckhbw %%mm7, %%mm4		\n\t"\
	"punpckhbw %%mm5, %%mm1		\n\t"\
\
	"psllq $2, %%mm3		\n\t"\
	"psllq $2, %%mm4		\n\t"\
\
	"por %%mm3, %%mm2		\n\t"\
	"por %%mm4, %%mm1		\n\t"\
\
	MOVNTQ(%%mm2, (dst, index, 2))\
	MOVNTQ(%%mm1, 8(dst, index, 2))\
\
	"add $8, "#index"		\n\t"\
	"cmp "#dstw", "#index"		\n\t"\
	" jb 1b				\n\t"
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
576 | |
/* Legacy 24-bit packer: builds 0RGB dwords then shuffles them into three
   packed-RGB quadwords with shift/mask sequences. Superseded by the
   WRITEBGR24MMX/WRITEBGR24MMX2 variants below. */
#define WRITEBGR24OLD(dst, dstw, index) \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1		\n\t" /* B */\
	"movq %%mm5, %%mm6		\n\t" /* R */\
	"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
	"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
	"psrlq $8, %%mm0		\n\t" /* 00RGB0RG 0 */\
	"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
	"pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
	"por %%mm4, %%mm0		\n\t" /* 00RGBRGB 0 */\
	"movq %%mm2, %%mm4		\n\t" /* 0RGB0RGB 1 */\
	"psllq $48, %%mm2		\n\t" /* GB000000 1 */\
	"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
\
	"movq %%mm4, %%mm2		\n\t" /* 0RGB0RGB 1 */\
	"psrld $16, %%mm4		\n\t" /* 000R000R 1 */\
	"psrlq $24, %%mm2		\n\t" /* 0000RGB0 1.5 */\
	"por %%mm4, %%mm2		\n\t" /* 000RRGBR 1 */\
	"pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
	"movq %%mm1, %%mm4		\n\t" /* 0RGB0RGB 2 */\
	"psrlq $8, %%mm1		\n\t" /* 00RGB0RG 2 */\
	"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
	"pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
	"por %%mm4, %%mm1		\n\t" /* 00RGBRGB 2 */\
	"movq %%mm1, %%mm4		\n\t" /* 00RGBRGB 2 */\
	"psllq $32, %%mm1		\n\t" /* BRGB0000 2 */\
	"por %%mm1, %%mm2		\n\t" /* BRGBRGBR 1 */\
\
	"psrlq $32, %%mm4		\n\t" /* 000000RG 2.5 */\
	"movq %%mm3, %%mm5		\n\t" /* 0RGB0RGB 3 */\
	"psrlq $8, %%mm3		\n\t" /* 00RGB0RG 3 */\
	"pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
	"pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
	"por %%mm5, %%mm3		\n\t" /* 00RGBRGB 3 */\
	"psllq $16, %%mm3		\n\t" /* RGBRGB00 3 */\
	"por %%mm4, %%mm3		\n\t" /* RGBRGBRG 2.5 */\
\
	MOVNTQ(%%mm0, (dst))\
	MOVNTQ(%%mm2, 8(dst))\
	MOVNTQ(%%mm3, 16(dst))\
	"add $24, "#dst"		\n\t"\
\
	"add $8, "#index"		\n\t"\
	"cmp "#dstw", "#index"		\n\t"\
	" jb 1b				\n\t"
632 | |
/* 24-bit packer, plain-MMX version: builds 0RGBRGB0 quadwords via
   punpckhdq and merges them into three 8-byte stores (24 bytes = 8 pixels). */
#define WRITEBGR24MMX(dst, dstw, index) \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1		\n\t" /* B */\
	"movq %%mm5, %%mm6		\n\t" /* R */\
	"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
	"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
	"movq %%mm2, %%mm6		\n\t" /* 0RGB0RGB 1 */\
	"movq %%mm1, %%mm5		\n\t" /* 0RGB0RGB 2 */\
	"movq %%mm3, %%mm7		\n\t" /* 0RGB0RGB 3 */\
\
	"psllq $40, %%mm0		\n\t" /* RGB00000 0 */\
	"psllq $40, %%mm2		\n\t" /* RGB00000 1 */\
	"psllq $40, %%mm1		\n\t" /* RGB00000 2 */\
	"psllq $40, %%mm3		\n\t" /* RGB00000 3 */\
\
	"punpckhdq %%mm4, %%mm0		\n\t" /* 0RGBRGB0 0 */\
	"punpckhdq %%mm6, %%mm2		\n\t" /* 0RGBRGB0 1 */\
	"punpckhdq %%mm5, %%mm1		\n\t" /* 0RGBRGB0 2 */\
	"punpckhdq %%mm7, %%mm3		\n\t" /* 0RGBRGB0 3 */\
\
	"psrlq $8, %%mm0		\n\t" /* 00RGBRGB 0 */\
	"movq %%mm2, %%mm6		\n\t" /* 0RGBRGB0 1 */\
	"psllq $40, %%mm2		\n\t" /* GB000000 1 */\
	"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
	MOVNTQ(%%mm0, (dst))\
\
	"psrlq $24, %%mm6		\n\t" /* 0000RGBR 1 */\
	"movq %%mm1, %%mm5		\n\t" /* 0RGBRGB0 2 */\
	"psllq $24, %%mm1		\n\t" /* BRGB0000 2 */\
	"por %%mm1, %%mm6		\n\t" /* BRGBRGBR 1 */\
	MOVNTQ(%%mm6, 8(dst))\
\
	"psrlq $40, %%mm5		\n\t" /* 000000RG 2 */\
	"psllq $8, %%mm3		\n\t" /* RGBRGB00 3 */\
	"por %%mm3, %%mm5		\n\t" /* RGBRGBRG 2 */\
	MOVNTQ(%%mm5, 16(dst))\
\
	"add $24, "#dst"		\n\t"\
\
	"add $8, "#index"		\n\t"\
	"cmp "#dstw", "#index"		\n\t"\
	" jb 1b				\n\t"
685 | |
/* 24-bit packer, MMX2 version: uses pshufw plus the M24A/M24B/M24C byte
   masks to assemble three packed-RGB quadwords directly. */
#define WRITEBGR24MMX2(dst, dstw, index) \
	/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq "MANGLE(M24A)", %%mm0	\n\t"\
	"movq "MANGLE(M24C)", %%mm7	\n\t"\
	"pshufw $0x50, %%mm2, %%mm1	\n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
	"pshufw $0x50, %%mm4, %%mm3	\n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
	"pshufw $0x00, %%mm5, %%mm6	\n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
	"pand %%mm0, %%mm1		\n\t" /*    B2        B1       B0 */\
	"pand %%mm0, %%mm3		\n\t" /*    G2        G1       G0 */\
	"pand %%mm7, %%mm6		\n\t" /*       R1        R0       */\
\
	"psllq $8, %%mm3		\n\t" /* G2        G1       G0    */\
	"por %%mm1, %%mm6		\n\t"\
	"por %%mm3, %%mm6		\n\t"\
	MOVNTQ(%%mm6, (dst))\
\
	"psrlq $8, %%mm4		\n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
	"pshufw $0xA5, %%mm2, %%mm1	\n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
	"pshufw $0x55, %%mm4, %%mm3	\n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
	"pshufw $0xA5, %%mm5, %%mm6	\n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
	"pand "MANGLE(M24B)", %%mm1	\n\t" /* B5       B4        B3    */\
	"pand %%mm7, %%mm3		\n\t" /*       G4        G3       */\
	"pand %%mm0, %%mm6		\n\t" /*    R4        R3       R2 */\
\
	"por %%mm1, %%mm3		\n\t" /* B5 G4 B4     G3 B3       */\
	"por %%mm3, %%mm6		\n\t"\
	MOVNTQ(%%mm6, 8(dst))\
\
	"pshufw $0xFF, %%mm2, %%mm1	\n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
	"pshufw $0xFA, %%mm4, %%mm3	\n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
	"pshufw $0xFA, %%mm5, %%mm6	\n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
	"pand %%mm7, %%mm1		\n\t" /*       B7        B6       */\
	"pand %%mm0, %%mm3		\n\t" /*    G7        G6       G5 */\
	"pand "MANGLE(M24B)", %%mm6	\n\t" /* R7       R6        R5    */\
\
	"por %%mm1, %%mm3		\n\t"\
	"por %%mm3, %%mm6		\n\t"\
	MOVNTQ(%%mm6, 16(dst))\
\
	"add $24, "#dst"		\n\t"\
\
	"add $8, "#index"		\n\t"\
	"cmp "#dstw", "#index"		\n\t"\
	" jb 1b				\n\t"
733 | |
/* Select the fastest available 24-bit writer for this SIMD flavor. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
741 | |
742 #define REAL_WRITEYUY2(dst, dstw, index) \ | |
743 "packuswb %%mm3, %%mm3 \n\t"\ | |
744 "packuswb %%mm4, %%mm4 \n\t"\ | |
745 "packuswb %%mm7, %%mm1 \n\t"\ | |
746 "punpcklbw %%mm4, %%mm3 \n\t"\ | |
747 "movq %%mm1, %%mm7 \n\t"\ | |
748 "punpcklbw %%mm3, %%mm1 \n\t"\ | |
749 "punpckhbw %%mm3, %%mm7 \n\t"\ | |
750 \ | |
751 MOVNTQ(%%mm1, (dst, index, 2))\ | |
752 MOVNTQ(%%mm7, 8(dst, index, 2))\ | |
753 \ | |
754 "add $8, "#index" \n\t"\ | |
755 "cmp "#dstw", "#index" \n\t"\ | |
756 " jb 1b \n\t" | |
757 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) | |
758 | |
759 | |
760 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | |
761 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
762 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) | |
763 { | |
764 #ifdef HAVE_MMX | |
765 if(uDest != NULL) | |
766 { | |
767 asm volatile( | |
768 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET) | |
769 :: "r" (&c->redDither), | |
770 "r" (uDest), "p" (chrDstW) | |
771 : "%"REG_a, "%"REG_d, "%"REG_S | |
772 ); | |
773 | |
774 asm volatile( | |
775 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET) | |
776 :: "r" (&c->redDither), | |
777 "r" (vDest), "p" (chrDstW) | |
778 : "%"REG_a, "%"REG_d, "%"REG_S | |
779 ); | |
780 } | |
781 | |
782 asm volatile( | |
783 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET) | |
784 :: "r" (&c->redDither), | |
785 "r" (dest), "p" (dstW) | |
786 : "%"REG_a, "%"REG_d, "%"REG_S | |
787 ); | |
788 #else | |
789 #ifdef HAVE_ALTIVEC | |
790 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize, | |
791 chrFilter, chrSrc, chrFilterSize, | |
792 dest, uDest, vDest, dstW, chrDstW); | |
793 #else //HAVE_ALTIVEC | |
794 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, | |
795 chrFilter, chrSrc, chrFilterSize, | |
796 dest, uDest, vDest, dstW, chrDstW); | |
797 #endif //!HAVE_ALTIVEC | |
798 #endif | |
799 } | |
800 | |
801 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | |
802 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
803 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat) | |
804 { | |
805 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize, | |
806 chrFilter, chrSrc, chrFilterSize, | |
807 dest, uDest, dstW, chrDstW, dstFormat); | |
808 } | |
809 | |
810 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc, | |
811 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) | |
812 { | |
813 #ifdef HAVE_MMX | |
814 if(uDest != NULL) | |
815 { | |
816 asm volatile( | |
817 YSCALEYUV2YV121 | |
818 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW), | |
819 "g" (-chrDstW) | |
820 : "%"REG_a | |
821 ); | |
822 | |
823 asm volatile( | |
824 YSCALEYUV2YV121 | |
825 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW), | |
826 "g" (-chrDstW) | |
827 : "%"REG_a | |
828 ); | |
829 } | |
830 | |
831 asm volatile( | |
832 YSCALEYUV2YV121 | |
833 :: "r" (lumSrc + dstW), "r" (dest + dstW), | |
834 "g" (-dstW) | |
835 : "%"REG_a | |
836 ); | |
837 #else | |
838 int i; | |
839 for(i=0; i<dstW; i++) | |
840 { | |
841 int val= lumSrc[i]>>7; | |
842 | |
843 if(val&256){ | |
844 if(val<0) val=0; | |
845 else val=255; | |
846 } | |
847 | |
848 dest[i]= val; | |
849 } | |
850 | |
851 if(uDest != NULL) | |
852 for(i=0; i<chrDstW; i++) | |
853 { | |
854 int u=chrSrc[i]>>7; | |
855 int v=chrSrc[i + 2048]>>7; | |
856 | |
857 if((u|v)&256){ | |
858 if(u<0) u=0; | |
859 else if (u>255) u=255; | |
860 if(v<0) v=0; | |
861 else if (v>255) v=255; | |
862 } | |
863 | |
864 uDest[i]= u; | |
865 vDest[i]= v; | |
866 } | |
867 #endif | |
868 } | |
869 | |
870 | |
871 /** | |
872 * vertical scale YV12 to RGB | |
873 */ | |
874 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | |
875 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
876 uint8_t *dest, long dstW, long dstY) | |
877 { | |
878 long dummy=0; | |
879 switch(c->dstFormat) | |
880 { | |
881 #ifdef HAVE_MMX | |
882 case IMGFMT_BGR32: | |
883 { | |
884 asm volatile( | |
885 YSCALEYUV2RGBX | |
886 WRITEBGR32(%4, %5, %%REGa) | |
887 | |
888 :: "r" (&c->redDither), | |
889 "m" (dummy), "m" (dummy), "m" (dummy), | |
890 "r" (dest), "m" (dstW) | |
891 : "%"REG_a, "%"REG_d, "%"REG_S | |
892 ); | |
893 } | |
894 break; | |
895 case IMGFMT_BGR24: | |
896 { | |
897 asm volatile( | |
898 YSCALEYUV2RGBX | |
899 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize | |
900 "add %4, %%"REG_b" \n\t" | |
901 WRITEBGR24(%%REGb, %5, %%REGa) | |
902 | |
903 :: "r" (&c->redDither), | |
904 "m" (dummy), "m" (dummy), "m" (dummy), | |
905 "r" (dest), "m" (dstW) | |
906 : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx | |
907 ); | |
908 } | |
909 break; | |
910 case IMGFMT_BGR15: | |
911 { | |
912 asm volatile( | |
913 YSCALEYUV2RGBX | |
914 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
915 #ifdef DITHER1XBPP | |
916 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
917 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
918 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
919 #endif | |
920 | |
921 WRITEBGR15(%4, %5, %%REGa) | |
922 | |
923 :: "r" (&c->redDither), | |
924 "m" (dummy), "m" (dummy), "m" (dummy), | |
925 "r" (dest), "m" (dstW) | |
926 : "%"REG_a, "%"REG_d, "%"REG_S | |
927 ); | |
928 } | |
929 break; | |
930 case IMGFMT_BGR16: | |
931 { | |
932 asm volatile( | |
933 YSCALEYUV2RGBX | |
934 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
935 #ifdef DITHER1XBPP | |
936 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
937 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
938 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
939 #endif | |
940 | |
941 WRITEBGR16(%4, %5, %%REGa) | |
942 | |
943 :: "r" (&c->redDither), | |
944 "m" (dummy), "m" (dummy), "m" (dummy), | |
945 "r" (dest), "m" (dstW) | |
946 : "%"REG_a, "%"REG_d, "%"REG_S | |
947 ); | |
948 } | |
949 break; | |
950 case IMGFMT_YUY2: | |
951 { | |
952 asm volatile( | |
953 YSCALEYUV2PACKEDX | |
954 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
955 | |
956 "psraw $3, %%mm3 \n\t" | |
957 "psraw $3, %%mm4 \n\t" | |
958 "psraw $3, %%mm1 \n\t" | |
959 "psraw $3, %%mm7 \n\t" | |
960 WRITEYUY2(%4, %5, %%REGa) | |
961 | |
962 :: "r" (&c->redDither), | |
963 "m" (dummy), "m" (dummy), "m" (dummy), | |
964 "r" (dest), "m" (dstW) | |
965 : "%"REG_a, "%"REG_d, "%"REG_S | |
966 ); | |
967 } | |
968 break; | |
969 #endif | |
970 default: | |
971 #ifdef HAVE_ALTIVEC | |
972 /* The following list of supported dstFormat values should | |
973 match what's found in the body of altivec_yuv2packedX() */ | |
974 if(c->dstFormat==IMGFMT_ABGR || c->dstFormat==IMGFMT_BGRA || | |
975 c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 || | |
976 c->dstFormat==IMGFMT_RGBA || c->dstFormat==IMGFMT_ARGB) | |
977 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize, | |
978 chrFilter, chrSrc, chrFilterSize, | |
979 dest, dstW, dstY); | |
980 else | |
981 #endif | |
982 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, | |
983 chrFilter, chrSrc, chrFilterSize, | |
984 dest, dstW, dstY); | |
985 break; | |
986 } | |
987 } | |
988 | |
989 /** | |
990 * vertical bilinear scale YV12 to RGB | |
991 */ | |
992 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, | |
993 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) | |
994 { | |
995 int yalpha1=yalpha^4095; | |
996 int uvalpha1=uvalpha^4095; | |
997 int i; | |
998 | |
999 #if 0 //isn't used | |
1000 if(flags&SWS_FULL_CHR_H_INT) | |
1001 { | |
1002 switch(dstFormat) | |
1003 { | |
1004 #ifdef HAVE_MMX | |
1005 case IMGFMT_BGR32: | |
1006 asm volatile( | |
1007 | |
1008 | |
1009 FULL_YSCALEYUV2RGB | |
1010 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
1011 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
1012 | |
1013 "movq %%mm3, %%mm1 \n\t" | |
1014 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
1015 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
1016 | |
1017 MOVNTQ(%%mm3, (%4, %%REGa, 4)) | |
1018 MOVNTQ(%%mm1, 8(%4, %%REGa, 4)) | |
1019 | |
1020 "add $4, %%"REG_a" \n\t" | |
1021 "cmp %5, %%"REG_a" \n\t" | |
1022 " jb 1b \n\t" | |
1023 | |
1024 | |
1025 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW), | |
1026 "m" (yalpha1), "m" (uvalpha1) | |
1027 : "%"REG_a | |
1028 ); | |
1029 break; | |
1030 case IMGFMT_BGR24: | |
1031 asm volatile( | |
1032 | |
1033 FULL_YSCALEYUV2RGB | |
1034 | |
1035 // lsb ... msb | |
1036 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
1037 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
1038 | |
1039 "movq %%mm3, %%mm1 \n\t" | |
1040 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
1041 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
1042 | |
1043 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0 | |
1044 "psrlq $8, %%mm3 \n\t" // GR0BGR00 | |
1045 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000 | |
1046 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00 | |
1047 "por %%mm2, %%mm3 \n\t" // BGRBGR00 | |
1048 "movq %%mm1, %%mm2 \n\t" | |
1049 "psllq $48, %%mm1 \n\t" // 000000BG | |
1050 "por %%mm1, %%mm3 \n\t" // BGRBGRBG | |
1051 | |
1052 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0 | |
1053 "psrld $16, %%mm2 \n\t" // R000R000 | |
1054 "psrlq $24, %%mm1 \n\t" // 0BGR0000 | |
1055 "por %%mm2, %%mm1 \n\t" // RBGRR000 | |
1056 | |
1057 "mov %4, %%"REG_b" \n\t" | |
1058 "add %%"REG_a", %%"REG_b" \n\t" | |
1059 | |
1060 #ifdef HAVE_MMX2 | |
1061 //FIXME Alignment | |
1062 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t" | |
1063 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t" | |
1064 #else | |
1065 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t" | |
1066 "psrlq $32, %%mm3 \n\t" | |
1067 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t" | |
1068 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t" | |
1069 #endif | |
1070 "add $4, %%"REG_a" \n\t" | |
1071 "cmp %5, %%"REG_a" \n\t" | |
1072 " jb 1b \n\t" | |
1073 | |
1074 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), | |
1075 "m" (yalpha1), "m" (uvalpha1) | |
1076 : "%"REG_a, "%"REG_b | |
1077 ); | |
1078 break; | |
1079 case IMGFMT_BGR15: | |
1080 asm volatile( | |
1081 | |
1082 FULL_YSCALEYUV2RGB | |
1083 #ifdef DITHER1XBPP | |
1084 "paddusb "MANGLE(g5Dither)", %%mm1\n\t" | |
1085 "paddusb "MANGLE(r5Dither)", %%mm0\n\t" | |
1086 "paddusb "MANGLE(b5Dither)", %%mm3\n\t" | |
1087 #endif | |
1088 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | |
1089 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
1090 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
1091 | |
1092 "psrlw $3, %%mm3 \n\t" | |
1093 "psllw $2, %%mm1 \n\t" | |
1094 "psllw $7, %%mm0 \n\t" | |
1095 "pand "MANGLE(g15Mask)", %%mm1 \n\t" | |
1096 "pand "MANGLE(r15Mask)", %%mm0 \n\t" | |
1097 | |
1098 "por %%mm3, %%mm1 \n\t" | |
1099 "por %%mm1, %%mm0 \n\t" | |
1100 | |
1101 MOVNTQ(%%mm0, (%4, %%REGa, 2)) | |
1102 | |
1103 "add $4, %%"REG_a" \n\t" | |
1104 "cmp %5, %%"REG_a" \n\t" | |
1105 " jb 1b \n\t" | |
1106 | |
1107 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), | |
1108 "m" (yalpha1), "m" (uvalpha1) | |
1109 : "%"REG_a | |
1110 ); | |
1111 break; | |
1112 case IMGFMT_BGR16: | |
1113 asm volatile( | |
1114 | |
1115 FULL_YSCALEYUV2RGB | |
1116 #ifdef DITHER1XBPP | |
1117 "paddusb "MANGLE(g6Dither)", %%mm1\n\t" | |
1118 "paddusb "MANGLE(r5Dither)", %%mm0\n\t" | |
1119 "paddusb "MANGLE(b5Dither)", %%mm3\n\t" | |
1120 #endif | |
1121 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | |
1122 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
1123 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
1124 | |
1125 "psrlw $3, %%mm3 \n\t" | |
1126 "psllw $3, %%mm1 \n\t" | |
1127 "psllw $8, %%mm0 \n\t" | |
1128 "pand "MANGLE(g16Mask)", %%mm1 \n\t" | |
1129 "pand "MANGLE(r16Mask)", %%mm0 \n\t" | |
1130 | |
1131 "por %%mm3, %%mm1 \n\t" | |
1132 "por %%mm1, %%mm0 \n\t" | |
1133 | |
1134 MOVNTQ(%%mm0, (%4, %%REGa, 2)) | |
1135 | |
1136 "add $4, %%"REG_a" \n\t" | |
1137 "cmp %5, %%"REG_a" \n\t" | |
1138 " jb 1b \n\t" | |
1139 | |
1140 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), | |
1141 "m" (yalpha1), "m" (uvalpha1) | |
1142 : "%"REG_a | |
1143 ); | |
1144 break; | |
1145 #endif | |
1146 case IMGFMT_RGB32: | |
1147 #ifndef HAVE_MMX | |
1148 case IMGFMT_BGR32: | |
1149 #endif | |
1150 if(dstFormat==IMGFMT_BGR32) | |
1151 { | |
1152 int i; | |
1153 #ifdef WORDS_BIGENDIAN | |
1154 dest++; | |
1155 #endif | |
1156 for(i=0;i<dstW;i++){ | |
1157 // vertical linear interpolation && yuv2rgb in a single step: | |
1158 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1159 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1160 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1161 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; | |
1162 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1163 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
1164 dest+= 4; | |
1165 } | |
1166 } | |
1167 else if(dstFormat==IMGFMT_BGR24) | |
1168 { | |
1169 int i; | |
1170 for(i=0;i<dstW;i++){ | |
1171 // vertical linear interpolation && yuv2rgb in a single step: | |
1172 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1173 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1174 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1175 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; | |
1176 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1177 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
1178 dest+= 3; | |
1179 } | |
1180 } | |
1181 else if(dstFormat==IMGFMT_BGR16) | |
1182 { | |
1183 int i; | |
1184 for(i=0;i<dstW;i++){ | |
1185 // vertical linear interpolation && yuv2rgb in a single step: | |
1186 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1187 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1188 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1189 | |
1190 ((uint16_t*)dest)[i] = | |
1191 clip_table16b[(Y + yuvtab_40cf[U]) >>13] | | |
1192 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1193 clip_table16r[(Y + yuvtab_3343[V]) >>13]; | |
1194 } | |
1195 } | |
1196 else if(dstFormat==IMGFMT_BGR15) | |
1197 { | |
1198 int i; | |
1199 for(i=0;i<dstW;i++){ | |
1200 // vertical linear interpolation && yuv2rgb in a single step: | |
1201 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1202 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1203 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1204 | |
1205 ((uint16_t*)dest)[i] = | |
1206 clip_table15b[(Y + yuvtab_40cf[U]) >>13] | | |
1207 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1208 clip_table15r[(Y + yuvtab_3343[V]) >>13]; | |
1209 } | |
1210 } | |
1211 }//FULL_UV_IPOL | |
1212 else | |
1213 { | |
1214 #endif // if 0 | |
1215 #ifdef HAVE_MMX | |
1216 switch(c->dstFormat) | |
1217 { | |
1218 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( | |
1219 case IMGFMT_BGR32: | |
1220 asm volatile( | |
1221 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1222 "mov %4, %%"REG_b" \n\t" | |
1223 "push %%"REG_BP" \n\t" | |
1224 YSCALEYUV2RGB(%%REGBP, %5) | |
1225 WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1226 "pop %%"REG_BP" \n\t" | |
1227 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1228 | |
1229 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1230 "a" (&c->redDither) | |
1231 ); | |
1232 return; | |
1233 case IMGFMT_BGR24: | |
1234 asm volatile( | |
1235 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1236 "mov %4, %%"REG_b" \n\t" | |
1237 "push %%"REG_BP" \n\t" | |
1238 YSCALEYUV2RGB(%%REGBP, %5) | |
1239 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1240 "pop %%"REG_BP" \n\t" | |
1241 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1242 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1243 "a" (&c->redDither) | |
1244 ); | |
1245 return; | |
1246 case IMGFMT_BGR15: | |
1247 asm volatile( | |
1248 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1249 "mov %4, %%"REG_b" \n\t" | |
1250 "push %%"REG_BP" \n\t" | |
1251 YSCALEYUV2RGB(%%REGBP, %5) | |
1252 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1253 #ifdef DITHER1XBPP | |
1254 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1255 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1256 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1257 #endif | |
1258 | |
1259 WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | |
1260 "pop %%"REG_BP" \n\t" | |
1261 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1262 | |
1263 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1264 "a" (&c->redDither) | |
1265 ); | |
1266 return; | |
1267 case IMGFMT_BGR16: | |
1268 asm volatile( | |
1269 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1270 "mov %4, %%"REG_b" \n\t" | |
1271 "push %%"REG_BP" \n\t" | |
1272 YSCALEYUV2RGB(%%REGBP, %5) | |
1273 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1274 #ifdef DITHER1XBPP | |
1275 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1276 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1277 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1278 #endif | |
1279 | |
1280 WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | |
1281 "pop %%"REG_BP" \n\t" | |
1282 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1283 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1284 "a" (&c->redDither) | |
1285 ); | |
1286 return; | |
1287 case IMGFMT_YUY2: | |
1288 asm volatile( | |
1289 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1290 "mov %4, %%"REG_b" \n\t" | |
1291 "push %%"REG_BP" \n\t" | |
1292 YSCALEYUV2PACKED(%%REGBP, %5) | |
1293 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1294 "pop %%"REG_BP" \n\t" | |
1295 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1296 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1297 "a" (&c->redDither) | |
1298 ); | |
1299 return; | |
1300 default: break; | |
1301 } | |
1302 #endif //HAVE_MMX | |
1303 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C) | |
1304 } | |
1305 | |
1306 /** | |
1307 * YV12 to RGB without scaling or interpolating | |
1308 */ | |
1309 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, | |
1310 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y) | |
1311 { | |
1312 const int yalpha1=0; | |
1313 int i; | |
1314 | |
1315 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1 | |
1316 const int yalpha= 4096; //FIXME ... | |
1317 | |
1318 if(flags&SWS_FULL_CHR_H_INT) | |
1319 { | |
1320 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y); | |
1321 return; | |
1322 } | |
1323 | |
1324 #ifdef HAVE_MMX | |
1325 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster | |
1326 { | |
1327 switch(dstFormat) | |
1328 { | |
1329 case IMGFMT_BGR32: | |
1330 asm volatile( | |
1331 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1332 "mov %4, %%"REG_b" \n\t" | |
1333 "push %%"REG_BP" \n\t" | |
1334 YSCALEYUV2RGB1(%%REGBP, %5) | |
1335 WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1336 "pop %%"REG_BP" \n\t" | |
1337 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1338 | |
1339 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1340 "a" (&c->redDither) | |
1341 ); | |
1342 return; | |
1343 case IMGFMT_BGR24: | |
1344 asm volatile( | |
1345 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1346 "mov %4, %%"REG_b" \n\t" | |
1347 "push %%"REG_BP" \n\t" | |
1348 YSCALEYUV2RGB1(%%REGBP, %5) | |
1349 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1350 "pop %%"REG_BP" \n\t" | |
1351 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1352 | |
1353 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1354 "a" (&c->redDither) | |
1355 ); | |
1356 return; | |
1357 case IMGFMT_BGR15: | |
1358 asm volatile( | |
1359 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1360 "mov %4, %%"REG_b" \n\t" | |
1361 "push %%"REG_BP" \n\t" | |
1362 YSCALEYUV2RGB1(%%REGBP, %5) | |
1363 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1364 #ifdef DITHER1XBPP | |
1365 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1366 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1367 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1368 #endif | |
1369 WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | |
1370 "pop %%"REG_BP" \n\t" | |
1371 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1372 | |
1373 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1374 "a" (&c->redDither) | |
1375 ); | |
1376 return; | |
1377 case IMGFMT_BGR16: | |
1378 asm volatile( | |
1379 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1380 "mov %4, %%"REG_b" \n\t" | |
1381 "push %%"REG_BP" \n\t" | |
1382 YSCALEYUV2RGB1(%%REGBP, %5) | |
1383 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1384 #ifdef DITHER1XBPP | |
1385 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1386 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1387 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1388 #endif | |
1389 | |
1390 WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | |
1391 "pop %%"REG_BP" \n\t" | |
1392 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1393 | |
1394 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1395 "a" (&c->redDither) | |
1396 ); | |
1397 return; | |
1398 case IMGFMT_YUY2: | |
1399 asm volatile( | |
1400 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1401 "mov %4, %%"REG_b" \n\t" | |
1402 "push %%"REG_BP" \n\t" | |
1403 YSCALEYUV2PACKED1(%%REGBP, %5) | |
1404 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1405 "pop %%"REG_BP" \n\t" | |
1406 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1407 | |
1408 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1409 "a" (&c->redDither) | |
1410 ); | |
1411 return; | |
1412 } | |
1413 } | |
1414 else | |
1415 { | |
1416 switch(dstFormat) | |
1417 { | |
1418 case IMGFMT_BGR32: | |
1419 asm volatile( | |
1420 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1421 "mov %4, %%"REG_b" \n\t" | |
1422 "push %%"REG_BP" \n\t" | |
1423 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1424 WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1425 "pop %%"REG_BP" \n\t" | |
1426 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1427 | |
1428 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1429 "a" (&c->redDither) | |
1430 ); | |
1431 return; | |
1432 case IMGFMT_BGR24: | |
1433 asm volatile( | |
1434 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1435 "mov %4, %%"REG_b" \n\t" | |
1436 "push %%"REG_BP" \n\t" | |
1437 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1438 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1439 "pop %%"REG_BP" \n\t" | |
1440 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1441 | |
1442 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1443 "a" (&c->redDither) | |
1444 ); | |
1445 return; | |
1446 case IMGFMT_BGR15: | |
1447 asm volatile( | |
1448 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1449 "mov %4, %%"REG_b" \n\t" | |
1450 "push %%"REG_BP" \n\t" | |
1451 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1452 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1453 #ifdef DITHER1XBPP | |
1454 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1455 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1456 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1457 #endif | |
1458 WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | |
1459 "pop %%"REG_BP" \n\t" | |
1460 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1461 | |
1462 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1463 "a" (&c->redDither) | |
1464 ); | |
1465 return; | |
1466 case IMGFMT_BGR16: | |
1467 asm volatile( | |
1468 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1469 "mov %4, %%"REG_b" \n\t" | |
1470 "push %%"REG_BP" \n\t" | |
1471 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1472 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1473 #ifdef DITHER1XBPP | |
1474 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1475 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1476 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1477 #endif | |
1478 | |
1479 WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | |
1480 "pop %%"REG_BP" \n\t" | |
1481 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1482 | |
1483 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1484 "a" (&c->redDither) | |
1485 ); | |
1486 return; | |
1487 case IMGFMT_YUY2: | |
1488 asm volatile( | |
1489 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1490 "mov %4, %%"REG_b" \n\t" | |
1491 "push %%"REG_BP" \n\t" | |
1492 YSCALEYUV2PACKED1b(%%REGBP, %5) | |
1493 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1494 "pop %%"REG_BP" \n\t" | |
1495 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1496 | |
1497 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1498 "a" (&c->redDither) | |
1499 ); | |
1500 return; | |
1501 } | |
1502 } | |
1503 #endif | |
1504 if( uvalpha < 2048 ) | |
1505 { | |
1506 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C) | |
1507 }else{ | |
1508 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C) | |
1509 } | |
1510 } | |
1511 | |
//FIXME the yuy2* input readers can read up to 7 samples too many
1513 | |
1514 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width) | |
1515 { | |
1516 #ifdef HAVE_MMX | |
1517 asm volatile( | |
1518 "movq "MANGLE(bm01010101)", %%mm2\n\t" | |
1519 "mov %0, %%"REG_a" \n\t" | |
1520 "1: \n\t" | |
1521 "movq (%1, %%"REG_a",2), %%mm0 \n\t" | |
1522 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" | |
1523 "pand %%mm2, %%mm0 \n\t" | |
1524 "pand %%mm2, %%mm1 \n\t" | |
1525 "packuswb %%mm1, %%mm0 \n\t" | |
1526 "movq %%mm0, (%2, %%"REG_a") \n\t" | |
1527 "add $8, %%"REG_a" \n\t" | |
1528 " js 1b \n\t" | |
1529 : : "g" (-width), "r" (src+width*2), "r" (dst+width) | |
1530 : "%"REG_a | |
1531 ); | |
1532 #else | |
1533 int i; | |
1534 for(i=0; i<width; i++) | |
1535 dst[i]= src[2*i]; | |
1536 #endif | |
1537 } | |
1538 | |
1539 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) | |
1540 { | |
1541 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1542 asm volatile( | |
1543 "movq "MANGLE(bm01010101)", %%mm4\n\t" | |
1544 "mov %0, %%"REG_a" \n\t" | |
1545 "1: \n\t" | |
1546 "movq (%1, %%"REG_a",4), %%mm0 \n\t" | |
1547 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" | |
1548 "movq (%2, %%"REG_a",4), %%mm2 \n\t" | |
1549 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t" | |
1550 PAVGB(%%mm2, %%mm0) | |
1551 PAVGB(%%mm3, %%mm1) | |
1552 "psrlw $8, %%mm0 \n\t" | |
1553 "psrlw $8, %%mm1 \n\t" | |
1554 "packuswb %%mm1, %%mm0 \n\t" | |
1555 "movq %%mm0, %%mm1 \n\t" | |
1556 "psrlw $8, %%mm0 \n\t" | |
1557 "pand %%mm4, %%mm1 \n\t" | |
1558 "packuswb %%mm0, %%mm0 \n\t" | |
1559 "packuswb %%mm1, %%mm1 \n\t" | |
1560 "movd %%mm0, (%4, %%"REG_a") \n\t" | |
1561 "movd %%mm1, (%3, %%"REG_a") \n\t" | |
1562 "add $4, %%"REG_a" \n\t" | |
1563 " js 1b \n\t" | |
1564 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width) | |
1565 : "%"REG_a | |
1566 ); | |
1567 #else | |
1568 int i; | |
1569 for(i=0; i<width; i++) | |
1570 { | |
1571 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1; | |
1572 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1; | |
1573 } | |
1574 #endif | |
1575 } | |
1576 | |
//this is almost identical to the previous function, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
1578 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width) | |
1579 { | |
1580 #ifdef HAVE_MMX | |
1581 asm volatile( | |
1582 "mov %0, %%"REG_a" \n\t" | |
1583 "1: \n\t" | |
1584 "movq (%1, %%"REG_a",2), %%mm0 \n\t" | |
1585 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" | |
1586 "psrlw $8, %%mm0 \n\t" | |
1587 "psrlw $8, %%mm1 \n\t" | |
1588 "packuswb %%mm1, %%mm0 \n\t" | |
1589 "movq %%mm0, (%2, %%"REG_a") \n\t" | |
1590 "add $8, %%"REG_a" \n\t" | |
1591 " js 1b \n\t" | |
1592 : : "g" (-width), "r" (src+width*2), "r" (dst+width) | |
1593 : "%"REG_a | |
1594 ); | |
1595 #else | |
1596 int i; | |
1597 for(i=0; i<width; i++) | |
1598 dst[i]= src[2*i+1]; | |
1599 #endif | |
1600 } | |
1601 | |
1602 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) | |
1603 { | |
1604 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1605 asm volatile( | |
1606 "movq "MANGLE(bm01010101)", %%mm4\n\t" | |
1607 "mov %0, %%"REG_a" \n\t" | |
1608 "1: \n\t" | |
1609 "movq (%1, %%"REG_a",4), %%mm0 \n\t" | |
1610 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" | |
1611 "movq (%2, %%"REG_a",4), %%mm2 \n\t" | |
1612 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t" | |
1613 PAVGB(%%mm2, %%mm0) | |
1614 PAVGB(%%mm3, %%mm1) | |
1615 "pand %%mm4, %%mm0 \n\t" | |
1616 "pand %%mm4, %%mm1 \n\t" | |
1617 "packuswb %%mm1, %%mm0 \n\t" | |
1618 "movq %%mm0, %%mm1 \n\t" | |
1619 "psrlw $8, %%mm0 \n\t" | |
1620 "pand %%mm4, %%mm1 \n\t" | |
1621 "packuswb %%mm0, %%mm0 \n\t" | |
1622 "packuswb %%mm1, %%mm1 \n\t" | |
1623 "movd %%mm0, (%4, %%"REG_a") \n\t" | |
1624 "movd %%mm1, (%3, %%"REG_a") \n\t" | |
1625 "add $4, %%"REG_a" \n\t" | |
1626 " js 1b \n\t" | |
1627 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width) | |
1628 : "%"REG_a | |
1629 ); | |
1630 #else | |
1631 int i; | |
1632 for(i=0; i<width; i++) | |
1633 { | |
1634 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1; | |
1635 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1; | |
1636 } | |
1637 #endif | |
1638 } | |
1639 | |
1640 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width) | |
1641 { | |
1642 int i; | |
1643 for(i=0; i<width; i++) | |
1644 { | |
1645 int b= ((uint32_t*)src)[i]&0xFF; | |
1646 int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
1647 int r= (((uint32_t*)src)[i]>>16)&0xFF; | |
1648 | |
1649 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); | |
1650 } | |
1651 } | |
1652 | |
1653 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1654 { | |
1655 int i; | |
1656 for(i=0; i<width; i++) | |
1657 { | |
1658 const int a= ((uint32_t*)src1)[2*i+0]; | |
1659 const int e= ((uint32_t*)src1)[2*i+1]; | |
1660 const int c= ((uint32_t*)src2)[2*i+0]; | |
1661 const int d= ((uint32_t*)src2)[2*i+1]; | |
1662 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF); | |
1663 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00); | |
1664 const int b= l&0x3FF; | |
1665 const int g= h>>8; | |
1666 const int r= l>>16; | |
1667 | |
1668 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1669 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1670 } | |
1671 } | |
1672 | |
/* Convert one row of packed 24-bit BGR (B,G,R byte order) to 8-bit luma.
 * dst receives 'width' Y samples; src holds 3*width bytes.
 * MMX path: 8 pixels per iteration — bytes are widened to words, weighted
 * with the bgr2YCoeff vector via pmaddwd, partial sums folded with the
 * w1111 vector, then biased with bgr2YOffset.  Both loop counters run from
 * a negative value up to 0 so a single 'js' terminates the loop. */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
	asm volatile(
		/* REG_a: dst index (counts -width..0); REG_b: src index (3 bytes/pixel) */
		"mov %2, %%"REG_a" \n\t"
		"movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
		"movq "MANGLE(w1111)", %%mm5 \n\t"
		"pxor %%mm7, %%mm7 \n\t"
		"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
		ASMALIGN16
		"1: \n\t"
		PREFETCH" 64(%0, %%"REG_b") \n\t"
		/* first 4 pixels: load four 3-byte groups, zero-extend to words */
		"movd (%0, %%"REG_b"), %%mm0 \n\t"
		"movd 3(%0, %%"REG_b"), %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"movd 6(%0, %%"REG_b"), %%mm2 \n\t"
		"movd 9(%0, %%"REG_b"), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		/* apply per-channel weights; each pmaddwd yields 2 dword partial sums */
		"pmaddwd %%mm6, %%mm0 \n\t"
		"pmaddwd %%mm6, %%mm1 \n\t"
		"pmaddwd %%mm6, %%mm2 \n\t"
		"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
		/* extra precision: drop 8 fractional bits before packing */
		"psrad $8, %%mm0 \n\t"
		"psrad $8, %%mm1 \n\t"
		"psrad $8, %%mm2 \n\t"
		"psrad $8, %%mm3 \n\t"
#endif
		/* fold the partial sums (pmaddwd with w1111 adds word pairs) */
		"packssdw %%mm1, %%mm0 \n\t"
		"packssdw %%mm3, %%mm2 \n\t"
		"pmaddwd %%mm5, %%mm0 \n\t"
		"pmaddwd %%mm5, %%mm2 \n\t"
		"packssdw %%mm2, %%mm0 \n\t"
		"psraw $7, %%mm0 \n\t"

		/* second 4 pixels, identical scheme */
		"movd 12(%0, %%"REG_b"), %%mm4 \n\t"
		"movd 15(%0, %%"REG_b"), %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm4 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"movd 18(%0, %%"REG_b"), %%mm2 \n\t"
		"movd 21(%0, %%"REG_b"), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"pmaddwd %%mm6, %%mm4 \n\t"
		"pmaddwd %%mm6, %%mm1 \n\t"
		"pmaddwd %%mm6, %%mm2 \n\t"
		"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4 \n\t"
		"psrad $8, %%mm1 \n\t"
		"psrad $8, %%mm2 \n\t"
		"psrad $8, %%mm3 \n\t"
#endif
		"packssdw %%mm1, %%mm4 \n\t"
		"packssdw %%mm3, %%mm2 \n\t"
		"pmaddwd %%mm5, %%mm4 \n\t"
		"pmaddwd %%mm5, %%mm2 \n\t"
		/* advance src index by 8 pixels * 3 bytes */
		"add $24, %%"REG_b" \n\t"
		"packssdw %%mm2, %%mm4 \n\t"
		"psraw $7, %%mm4 \n\t"

		/* pack all 8 luma values to bytes and add the Y offset */
		"packuswb %%mm4, %%mm0 \n\t"
		"paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"

		"movq %%mm0, (%1, %%"REG_a") \n\t"
		"add $8, %%"REG_a" \n\t"
		" js 1b \n\t"
		: : "r" (src+width*3), "r" (dst+width), "g" (-width)
		: "%"REG_a, "%"REG_b
	);
#else
	/* scalar fallback: same weighted sum with rounding bias */
	int i;
	for(i=0; i<width; i++)
	{
		int b= src[i*3+0];
		int g= src[i*3+1];
		int r= src[i*3+2];

		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
	}
#endif
}
1757 | |
/* Convert two adjacent rows of packed 24-bit BGR to one row of 8-bit
 * chroma (U and V), with 2x2 averaging (horizontal pair + the two rows).
 * dstU/dstV each receive 'width' samples; src1/src2 are the two input rows
 * (6 bytes, i.e. 2 pixels, consumed per output sample).
 * MMX path: 4 chroma samples per iteration.  With MMX2/3DNow the 2x2
 * average is done bytewise via PAVGB; otherwise the four pixels are
 * widened to words, summed and divided by 4 with psrlw.  U uses the
 * bgr2UCoeff vector (mm6), V the bgr2VCoeff vector. */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
	asm volatile(
		/* REG_a: dst index (-width..0); REG_b: src index (6 bytes per sample) */
		"mov %4, %%"REG_a" \n\t"
		"movq "MANGLE(w1111)", %%mm5 \n\t"
		"movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
		"pxor %%mm7, %%mm7 \n\t"
		"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
		"add %%"REG_b", %%"REG_b" \n\t"
		ASMALIGN16
		"1: \n\t"
		PREFETCH" 64(%0, %%"REG_b") \n\t"
		PREFETCH" 64(%1, %%"REG_b") \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		/* fast 2x2 average: vertical PAVGB, then horizontal PAVGB
		   against the same data shifted by one pixel (24 bits) */
		"movq (%0, %%"REG_b"), %%mm0 \n\t"
		"movq (%1, %%"REG_b"), %%mm1 \n\t"
		"movq 6(%0, %%"REG_b"), %%mm2 \n\t"
		"movq 6(%1, %%"REG_b"), %%mm3 \n\t"
		PAVGB(%%mm1, %%mm0)
		PAVGB(%%mm3, %%mm2)
		"movq %%mm0, %%mm1 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"psrlq $24, %%mm0 \n\t"
		"psrlq $24, %%mm2 \n\t"
		PAVGB(%%mm1, %%mm0)
		PAVGB(%%mm3, %%mm2)
		"punpcklbw %%mm7, %%mm0 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
#else
		/* plain MMX: widen the 4 pixels to words, add, then /4 via psrlw */
		"movd (%0, %%"REG_b"), %%mm0 \n\t"
		"movd (%1, %%"REG_b"), %%mm1 \n\t"
		"movd 3(%0, %%"REG_b"), %%mm2 \n\t"
		"movd 3(%1, %%"REG_b"), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"paddw %%mm1, %%mm0 \n\t"
		"paddw %%mm3, %%mm2 \n\t"
		"paddw %%mm2, %%mm0 \n\t"
		"movd 6(%0, %%"REG_b"), %%mm4 \n\t"
		"movd 6(%1, %%"REG_b"), %%mm1 \n\t"
		"movd 9(%0, %%"REG_b"), %%mm2 \n\t"
		"movd 9(%1, %%"REG_b"), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm4 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"paddw %%mm1, %%mm4 \n\t"
		"paddw %%mm3, %%mm2 \n\t"
		"paddw %%mm4, %%mm2 \n\t"
		"psrlw $2, %%mm0 \n\t"
		"psrlw $2, %%mm2 \n\t"
#endif
		/* U via mm6 (bgr2UCoeff), V via mm1/mm3 (bgr2VCoeff) */
		"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
		"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

		"pmaddwd %%mm0, %%mm1 \n\t"
		"pmaddwd %%mm2, %%mm3 \n\t"
		"pmaddwd %%mm6, %%mm0 \n\t"
		"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0 \n\t"
		"psrad $8, %%mm1 \n\t"
		"psrad $8, %%mm2 \n\t"
		"psrad $8, %%mm3 \n\t"
#endif
		"packssdw %%mm2, %%mm0 \n\t"
		"packssdw %%mm3, %%mm1 \n\t"
		"pmaddwd %%mm5, %%mm0 \n\t"
		"pmaddwd %%mm5, %%mm1 \n\t"
		"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
		"psraw $7, %%mm0 \n\t"

		/* second pair of output samples (source bytes 12..23) */
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		"movq 12(%0, %%"REG_b"), %%mm4 \n\t"
		"movq 12(%1, %%"REG_b"), %%mm1 \n\t"
		"movq 18(%0, %%"REG_b"), %%mm2 \n\t"
		"movq 18(%1, %%"REG_b"), %%mm3 \n\t"
		PAVGB(%%mm1, %%mm4)
		PAVGB(%%mm3, %%mm2)
		"movq %%mm4, %%mm1 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"psrlq $24, %%mm4 \n\t"
		"psrlq $24, %%mm2 \n\t"
		PAVGB(%%mm1, %%mm4)
		PAVGB(%%mm3, %%mm2)
		"punpcklbw %%mm7, %%mm4 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
#else
		"movd 12(%0, %%"REG_b"), %%mm4 \n\t"
		"movd 12(%1, %%"REG_b"), %%mm1 \n\t"
		"movd 15(%0, %%"REG_b"), %%mm2 \n\t"
		"movd 15(%1, %%"REG_b"), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm4 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"paddw %%mm1, %%mm4 \n\t"
		"paddw %%mm3, %%mm2 \n\t"
		"paddw %%mm2, %%mm4 \n\t"
		"movd 18(%0, %%"REG_b"), %%mm5 \n\t"
		"movd 18(%1, %%"REG_b"), %%mm1 \n\t"
		"movd 21(%0, %%"REG_b"), %%mm2 \n\t"
		"movd 21(%1, %%"REG_b"), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm5 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"paddw %%mm1, %%mm5 \n\t"
		"paddw %%mm3, %%mm2 \n\t"
		"paddw %%mm5, %%mm2 \n\t"
		/* mm5 was clobbered above — restore the w1111 constant */
		"movq "MANGLE(w1111)", %%mm5 \n\t"
		"psrlw $2, %%mm4 \n\t"
		"psrlw $2, %%mm2 \n\t"
#endif
		"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
		"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

		"pmaddwd %%mm4, %%mm1 \n\t"
		"pmaddwd %%mm2, %%mm3 \n\t"
		"pmaddwd %%mm6, %%mm4 \n\t"
		"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4 \n\t"
		"psrad $8, %%mm1 \n\t"
		"psrad $8, %%mm2 \n\t"
		"psrad $8, %%mm3 \n\t"
#endif
		"packssdw %%mm2, %%mm4 \n\t"
		"packssdw %%mm3, %%mm1 \n\t"
		"pmaddwd %%mm5, %%mm4 \n\t"
		"pmaddwd %%mm5, %%mm1 \n\t"
		/* advance src index by 4 samples * 6 bytes */
		"add $24, %%"REG_b" \n\t"
		"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
		"psraw $7, %%mm4 \n\t"

		/* interleave U0..U3 / V0..V3, add chroma offset, store 4+4 bytes */
		"movq %%mm0, %%mm1 \n\t"
		"punpckldq %%mm4, %%mm0 \n\t"
		"punpckhdq %%mm4, %%mm1 \n\t"
		"packsswb %%mm1, %%mm0 \n\t"
		"paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"

		"movd %%mm0, (%2, %%"REG_a") \n\t"
		"punpckhdq %%mm0, %%mm0 \n\t"
		"movd %%mm0, (%3, %%"REG_a") \n\t"
		"add $4, %%"REG_a" \n\t"
		" js 1b \n\t"
		: : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
		: "%"REG_a, "%"REG_b
	);
#else
	/* scalar fallback: sum the 2x2 block per channel, weight, scale by /4 */
	int i;
	for(i=0; i<width; i++)
	{
		int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
		int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
		int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
#endif
}
1923 | |
1924 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width) | |
1925 { | |
1926 int i; | |
1927 for(i=0; i<width; i++) | |
1928 { | |
1929 int d= ((uint16_t*)src)[i]; | |
1930 int b= d&0x1F; | |
1931 int g= (d>>5)&0x3F; | |
1932 int r= (d>>11)&0x1F; | |
1933 | |
1934 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16; | |
1935 } | |
1936 } | |
1937 | |
1938 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1939 { | |
1940 int i; | |
1941 for(i=0; i<width; i++) | |
1942 { | |
1943 int d0= ((uint32_t*)src1)[i]; | |
1944 int d1= ((uint32_t*)src2)[i]; | |
1945 | |
1946 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F); | |
1947 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F); | |
1948 | |
1949 int dh2= (dh>>11) + (dh<<21); | |
1950 int d= dh2 + dl; | |
1951 | |
1952 int b= d&0x7F; | |
1953 int r= (d>>11)&0x7F; | |
1954 int g= d>>21; | |
1955 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128; | |
1956 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128; | |
1957 } | |
1958 } | |
1959 | |
1960 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width) | |
1961 { | |
1962 int i; | |
1963 for(i=0; i<width; i++) | |
1964 { | |
1965 int d= ((uint16_t*)src)[i]; | |
1966 int b= d&0x1F; | |
1967 int g= (d>>5)&0x1F; | |
1968 int r= (d>>10)&0x1F; | |
1969 | |
1970 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16; | |
1971 } | |
1972 } | |
1973 | |
1974 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1975 { | |
1976 int i; | |
1977 for(i=0; i<width; i++) | |
1978 { | |
1979 int d0= ((uint32_t*)src1)[i]; | |
1980 int d1= ((uint32_t*)src2)[i]; | |
1981 | |
1982 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F); | |
1983 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F); | |
1984 | |
1985 int dh2= (dh>>11) + (dh<<21); | |
1986 int d= dh2 + dl; | |
1987 | |
1988 int b= d&0x7F; | |
1989 int r= (d>>10)&0x7F; | |
1990 int g= d>>21; | |
1991 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128; | |
1992 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128; | |
1993 } | |
1994 } | |
1995 | |
1996 | |
1997 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width) | |
1998 { | |
1999 int i; | |
2000 for(i=0; i<width; i++) | |
2001 { | |
2002 int r= ((uint32_t*)src)[i]&0xFF; | |
2003 int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
2004 int b= (((uint32_t*)src)[i]>>16)&0xFF; | |
2005 | |
2006 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); | |
2007 } | |
2008 } | |
2009 | |
2010 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2011 { | |
2012 int i; | |
2013 for(i=0; i<width; i++) | |
2014 { | |
2015 const int a= ((uint32_t*)src1)[2*i+0]; | |
2016 const int e= ((uint32_t*)src1)[2*i+1]; | |
2017 const int c= ((uint32_t*)src2)[2*i+0]; | |
2018 const int d= ((uint32_t*)src2)[2*i+1]; | |
2019 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF); | |
2020 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00); | |
2021 const int r= l&0x3FF; | |
2022 const int g= h>>8; | |
2023 const int b= l>>16; | |
2024 | |
2025 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2026 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2027 } | |
2028 } | |
2029 | |
2030 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width) | |
2031 { | |
2032 int i; | |
2033 for(i=0; i<width; i++) | |
2034 { | |
2035 int r= src[i*3+0]; | |
2036 int g= src[i*3+1]; | |
2037 int b= src[i*3+2]; | |
2038 | |
2039 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); | |
2040 } | |
2041 } | |
2042 | |
2043 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2044 { | |
2045 int i; | |
2046 for(i=0; i<width; i++) | |
2047 { | |
2048 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3]; | |
2049 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4]; | |
2050 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5]; | |
2051 | |
2052 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2053 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2054 } | |
2055 } | |
2056 | |
2057 | |
2058 // Bilinear / Bicubic scaling | |
2059 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, | |
2060 int16_t *filter, int16_t *filterPos, long filterSize) | |
2061 { | |
2062 #ifdef HAVE_MMX | |
2063 assert(filterSize % 4 == 0 && filterSize>0); | |
2064 if(filterSize==4) // allways true for upscaling, sometimes for down too | |
2065 { | |
2066 long counter= -2*dstW; | |
2067 filter-= counter*2; | |
2068 filterPos-= counter/2; | |
2069 dst-= counter/2; | |
2070 asm volatile( | |
2071 "pxor %%mm7, %%mm7 \n\t" | |
2072 "movq "MANGLE(w02)", %%mm6 \n\t" | |
2073 "push %%"REG_BP" \n\t" // we use 7 regs here ... | |
2074 "mov %%"REG_a", %%"REG_BP" \n\t" | |
2075 ASMALIGN16 | |
2076 "1: \n\t" | |
2077 "movzwl (%2, %%"REG_BP"), %%eax \n\t" | |
2078 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t" | |
2079 "movq (%1, %%"REG_BP", 4), %%mm1\n\t" | |
2080 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t" | |
2081 "movd (%3, %%"REG_a"), %%mm0 \n\t" | |
2082 "movd (%3, %%"REG_b"), %%mm2 \n\t" | |
2083 "punpcklbw %%mm7, %%mm0 \n\t" | |
2084 "punpcklbw %%mm7, %%mm2 \n\t" | |
2085 "pmaddwd %%mm1, %%mm0 \n\t" | |
2086 "pmaddwd %%mm2, %%mm3 \n\t" | |
2087 "psrad $8, %%mm0 \n\t" | |
2088 "psrad $8, %%mm3 \n\t" | |
2089 "packssdw %%mm3, %%mm0 \n\t" | |
2090 "pmaddwd %%mm6, %%mm0 \n\t" | |
2091 "packssdw %%mm0, %%mm0 \n\t" | |
2092 "movd %%mm0, (%4, %%"REG_BP") \n\t" | |
2093 "add $4, %%"REG_BP" \n\t" | |
2094 " jnc 1b \n\t" | |
2095 | |
2096 "pop %%"REG_BP" \n\t" | |
2097 : "+a" (counter) | |
2098 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
2099 : "%"REG_b | |
2100 ); | |
2101 } | |
2102 else if(filterSize==8) | |
2103 { | |
2104 long counter= -2*dstW; | |
2105 filter-= counter*4; | |
2106 filterPos-= counter/2; | |
2107 dst-= counter/2; | |
2108 asm volatile( | |
2109 "pxor %%mm7, %%mm7 \n\t" | |
2110 "movq "MANGLE(w02)", %%mm6 \n\t" | |
2111 "push %%"REG_BP" \n\t" // we use 7 regs here ... | |
2112 "mov %%"REG_a", %%"REG_BP" \n\t" | |
2113 ASMALIGN16 | |
2114 "1: \n\t" | |
2115 "movzwl (%2, %%"REG_BP"), %%eax \n\t" | |
2116 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t" | |
2117 "movq (%1, %%"REG_BP", 8), %%mm1\n\t" | |
2118 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t" | |
2119 "movd (%3, %%"REG_a"), %%mm0 \n\t" | |
2120 "movd (%3, %%"REG_b"), %%mm2 \n\t" | |
2121 "punpcklbw %%mm7, %%mm0 \n\t" | |
2122 "punpcklbw %%mm7, %%mm2 \n\t" | |
2123 "pmaddwd %%mm1, %%mm0 \n\t" | |
2124 "pmaddwd %%mm2, %%mm3 \n\t" | |
2125 | |
2126 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t" | |
2127 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t" | |
2128 "movd 4(%3, %%"REG_a"), %%mm4 \n\t" | |
2129 "movd 4(%3, %%"REG_b"), %%mm2 \n\t" | |
2130 "punpcklbw %%mm7, %%mm4 \n\t" | |
2131 "punpcklbw %%mm7, %%mm2 \n\t" | |
2132 "pmaddwd %%mm1, %%mm4 \n\t" | |
2133 "pmaddwd %%mm2, %%mm5 \n\t" | |
2134 "paddd %%mm4, %%mm0 \n\t" | |
2135 "paddd %%mm5, %%mm3 \n\t" | |
2136 | |
2137 "psrad $8, %%mm0 \n\t" | |
2138 "psrad $8, %%mm3 \n\t" | |
2139 "packssdw %%mm3, %%mm0 \n\t" | |
2140 "pmaddwd %%mm6, %%mm0 \n\t" | |
2141 "packssdw %%mm0, %%mm0 \n\t" | |
2142 "movd %%mm0, (%4, %%"REG_BP") \n\t" | |
2143 "add $4, %%"REG_BP" \n\t" | |
2144 " jnc 1b \n\t" | |
2145 | |
2146 "pop %%"REG_BP" \n\t" | |
2147 : "+a" (counter) | |
2148 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
2149 : "%"REG_b | |
2150 ); | |
2151 } | |
2152 else | |
2153 { | |
2154 uint8_t *offset = src+filterSize; | |
2155 long counter= -2*dstW; | |
2156 // filter-= counter*filterSize/2; | |
2157 filterPos-= counter/2; | |
2158 dst-= counter/2; | |
2159 asm volatile( | |
2160 "pxor %%mm7, %%mm7 \n\t" | |
2161 "movq "MANGLE(w02)", %%mm6 \n\t" | |
2162 ASMALIGN16 | |
2163 "1: \n\t" | |
2164 "mov %2, %%"REG_c" \n\t" | |
2165 "movzwl (%%"REG_c", %0), %%eax \n\t" | |
2166 "movzwl 2(%%"REG_c", %0), %%ebx \n\t" | |
2167 "mov %5, %%"REG_c" \n\t" | |
2168 "pxor %%mm4, %%mm4 \n\t" | |
2169 "pxor %%mm5, %%mm5 \n\t" | |
2170 "2: \n\t" | |
2171 "movq (%1), %%mm1 \n\t" | |
2172 "movq (%1, %6), %%mm3 \n\t" | |
2173 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t" | |
2174 "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t" | |
2175 "punpcklbw %%mm7, %%mm0 \n\t" | |
2176 "punpcklbw %%mm7, %%mm2 \n\t" | |
2177 "pmaddwd %%mm1, %%mm0 \n\t" | |
2178 "pmaddwd %%mm2, %%mm3 \n\t" | |
2179 "paddd %%mm3, %%mm5 \n\t" | |
2180 "paddd %%mm0, %%mm4 \n\t" | |
2181 "add $8, %1 \n\t" | |
2182 "add $4, %%"REG_c" \n\t" | |
2183 "cmp %4, %%"REG_c" \n\t" | |
2184 " jb 2b \n\t" | |
2185 "add %6, %1 \n\t" | |
2186 "psrad $8, %%mm4 \n\t" | |
2187 "psrad $8, %%mm5 \n\t" | |
2188 "packssdw %%mm5, %%mm4 \n\t" | |
2189 "pmaddwd %%mm6, %%mm4 \n\t" | |
2190 "packssdw %%mm4, %%mm4 \n\t" | |
2191 "mov %3, %%"REG_a" \n\t" | |
2192 "movd %%mm4, (%%"REG_a", %0) \n\t" | |
2193 "add $4, %0 \n\t" | |
2194 " jnc 1b \n\t" | |
2195 | |
2196 : "+r" (counter), "+r" (filter) | |
2197 : "m" (filterPos), "m" (dst), "m"(offset), | |
2198 "m" (src), "r" (filterSize*2) | |
2199 : "%"REG_b, "%"REG_a, "%"REG_c | |
2200 ); | |
2201 } | |
2202 #else | |
2203 #ifdef HAVE_ALTIVEC | |
2204 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize); | |
2205 #else | |
2206 int i; | |
2207 for(i=0; i<dstW; i++) | |
2208 { | |
2209 int j; | |
2210 int srcPos= filterPos[i]; | |
2211 int val=0; | |
2212 // printf("filterPos: %d\n", filterPos[i]); | |
2213 for(j=0; j<filterSize; j++) | |
2214 { | |
2215 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); | |
2216 val += ((int)src[srcPos + j])*filter[filterSize*i + j]; | |
2217 } | |
2218 // filter += hFilterSize; | |
2219 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ... | |
2220 // dst[i] = val>>7; | |
2221 } | |
2222 #endif | |
2223 #endif | |
2224 } | |
// *** horizontal scale Y line to temp buffer
/* Horizontally scale one luma line into the 16-bit temp buffer 'dst'.
 * Packed/RGB source formats are first converted to 8-bit gray into
 * formatConvBuffer.  Depending on flags/CPU one of three paths runs:
 *   - the generic FIR hScale() (exact filter),
 *   - the MMX2 "funny code" (runtime-generated scaler, called per chunk),
 *   - a fast-bilinear x86 asm loop (or plain C bilinear elsewhere).
 * Bilinear output is scaled by 128 (hence the *128 edge handling). */
static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
				   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
				   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
				   int32_t *mmx2FilterPos)
{
    /* convert packed/RGB input to a plain 8-bit luma line first */
    if(srcFormat==IMGFMT_YUY2)
    {
	RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
	RENAME(uyvyToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
	RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
	RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
	RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
	RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
	RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
	RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }

#ifdef HAVE_MMX
	// use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
	RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		/* invoke the runtime-generated scaler (funnyYCode) 8 times;
		   mmx2FilterPos supplies the per-chunk src advance */
		asm volatile(
			"pxor %%mm7, %%mm7 \n\t"
			"mov %0, %%"REG_c" \n\t"
			"mov %1, %%"REG_D" \n\t"
			"mov %2, %%"REG_d" \n\t"
			"mov %3, %%"REG_b" \n\t"
			"xor %%"REG_a", %%"REG_a" \n\t" // i
			PREFETCH" (%%"REG_c") \n\t"
			PREFETCH" 32(%%"REG_c") \n\t"
			PREFETCH" 64(%%"REG_c") \n\t"

#ifdef ARCH_X86_64

#define FUNNY_Y_CODE \
			"movl (%%"REG_b"), %%esi \n\t"\
			"call *%4 \n\t"\
			"movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
			"add %%"REG_S", %%"REG_c" \n\t"\
			"add %%"REG_a", %%"REG_D" \n\t"\
			"xor %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
			"movl (%%"REG_b"), %%esi \n\t"\
			"call *%4 \n\t"\
			"addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
			"add %%"REG_a", %%"REG_D" \n\t"\
			"xor %%"REG_a", %%"REG_a" \n\t"\

#endif

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

			:: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
			"m" (funnyYCode)
			: "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
		);
		/* patch up the right edge the funny code cannot reach */
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
	}
	else
	{
#endif
	long xInc_shr16 = xInc >> 16;
	uint16_t xInc_mask = xInc & 0xffff;
	//NO MMX just normal asm ...
	/* fast bilinear via 16.16 fixed point; two output samples per loop,
	   position advanced with add-with-carry (addw + adc) */
	asm volatile(
		"xor %%"REG_a", %%"REG_a" \n\t" // i
		"xor %%"REG_b", %%"REG_b" \n\t" // xx
		"xorl %%ecx, %%ecx \n\t" // 2*xalpha
		ASMALIGN16
		"1: \n\t"
		"movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
		"movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
		"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi \n\t"
		"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"mov %1, %%"REG_D" \n\t"
		"shrl $9, %%esi \n\t"
		"movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
		"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
		"adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry

		"movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
		"movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
		"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi \n\t"
		"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"mov %1, %%"REG_D" \n\t"
		"shrl $9, %%esi \n\t"
		"movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
		"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
		"adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry


		"add $2, %%"REG_a" \n\t"
		"cmp %2, %%"REG_a" \n\t"
		" jb 1b \n\t"


		:: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
		: "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
		);
#ifdef HAVE_MMX2
	} //if MMX2 can't be used
#endif
#else
	/* portable C bilinear: 16.16 fixed-point position, 7-bit alpha */
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
		xpos+=xInc;
	}
#endif
    }
}
2397 | |
/* Horizontally scale one chroma line pair (U and V) into the 16-bit temp
 * buffer: U goes to dst[0..], V to dst+2048.  Mirrors hyscale(): packed/RGB
 * sources are first converted (into formatConvBuffer / +2048), then either
 * the exact FIR hScale(), the MMX2 "funny code", a fast-bilinear asm loop,
 * or the C bilinear fallback produces the scaled line.  Gray formats have
 * no chroma and return early. */
inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
				   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
				   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
				   int32_t *mmx2FilterPos)
{
    /* convert packed/RGB input into two planar 8-bit chroma lines first */
    if(srcFormat==IMGFMT_YUY2)
    {
	RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
	RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
	RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
	RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
	RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
	RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
	RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
	RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(isGray(srcFormat))
    {
	/* gray has no chroma planes to scale */
    	return;
    }

#ifdef HAVE_MMX
	// use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
	RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
	RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		/* run the runtime-generated scaler 4x for U, reset the
		   pointers to src2 / dst+4096 bytes, then 4x for V */
		asm volatile(
			"pxor %%mm7, %%mm7 \n\t"
			"mov %0, %%"REG_c" \n\t"
			"mov %1, %%"REG_D" \n\t"
			"mov %2, %%"REG_d" \n\t"
			"mov %3, %%"REG_b" \n\t"
			"xor %%"REG_a", %%"REG_a" \n\t" // i
			PREFETCH" (%%"REG_c") \n\t"
			PREFETCH" 32(%%"REG_c") \n\t"
			PREFETCH" 64(%%"REG_c") \n\t"

#ifdef ARCH_X86_64

#define FUNNY_UV_CODE \
			"movl (%%"REG_b"), %%esi \n\t"\
			"call *%4 \n\t"\
			"movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
			"add %%"REG_S", %%"REG_c" \n\t"\
			"add %%"REG_a", %%"REG_D" \n\t"\
			"xor %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_UV_CODE \
			"movl (%%"REG_b"), %%esi \n\t"\
			"call *%4 \n\t"\
			"addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
			"add %%"REG_a", %%"REG_D" \n\t"\
			"xor %%"REG_a", %%"REG_a" \n\t"\

#endif

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
			"xor %%"REG_a", %%"REG_a" \n\t" // i
			"mov %5, %%"REG_c" \n\t" // src
			"mov %1, %%"REG_D" \n\t" // buf1
			"add $4096, %%"REG_D" \n\t"
			PREFETCH" (%%"REG_c") \n\t"
			PREFETCH" 32(%%"REG_c") \n\t"
			PREFETCH" 64(%%"REG_c") \n\t"

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

			:: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
			"m" (funnyUVCode), "m" (src2)
			: "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
		);
		/* patch up the right edge the funny code cannot reach */
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
		{
//			printf("%d %d %d\n", dstWidth, i, srcW);
			dst[i] = src1[srcW-1]*128;
			dst[i+2048] = src2[srcW-1]*128;
		}
	}
	else
	{
#endif
	long xInc_shr16 = (long) (xInc >> 16);
	uint16_t xInc_mask = xInc & 0xffff;
	/* fast bilinear: one U and one V sample per loop, shared 16.16
	   fixed-point position advanced with add-with-carry */
	asm volatile(
		"xor %%"REG_a", %%"REG_a" \n\t" // i
		"xor %%"REG_b", %%"REG_b" \n\t" // xx
		"xorl %%ecx, %%ecx \n\t" // 2*xalpha
		ASMALIGN16
		"1: \n\t"
		"mov %0, %%"REG_S" \n\t"
		"movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
		"movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
		"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi \n\t"
		"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"mov %1, %%"REG_D" \n\t"
		"shrl $9, %%esi \n\t"
		"movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"

		"movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
		"movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
		"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi \n\t"
		"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"mov %1, %%"REG_D" \n\t"
		"shrl $9, %%esi \n\t"
		"movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"

		"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
		"adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
		"add $1, %%"REG_a" \n\t"
		"cmp %2, %%"REG_a" \n\t"
		" jb 1b \n\t"

/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC-4.0 */
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
		:: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
		:: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
		"r" (src2)
		: "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
		);
#ifdef HAVE_MMX2
	} //if MMX2 can't be used
#endif
#else
	/* portable C bilinear for both chroma planes */
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
		dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
	dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
	dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
		xpos+=xInc;
	}
#endif
   }
}
2603 | |
/**
 * Scale and convert one slice of the source image into the destination.
 *
 * This is a template body: the file is #included once per CPU flavour
 * (plain C, MMX, MMX2, 3DNow) and RENAME() gives each variant a distinct
 * symbol name.
 *
 * src/srcStride  source planes and strides; the slice starts at source
 *                line srcSliceY and is srcSliceH lines high.
 * dst/dstStride  destination planes and strides (always describing the
 *                whole output picture; the vertical output position is
 *                tracked in c->dstY across successive slice calls).
 * Returns the number of destination lines produced for this slice.
 *
 * Overall flow: source lines are horizontally scaled into the lumPixBuf /
 * chrPixBuf ring buffers; output lines are then produced by vertically
 * filtering those buffered lines. Ring-buffer state is stored back into
 * the context at the end so the caller may feed the picture in slices.
 */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){

	/* load a few things into local vars to make the code more readable? and faster */
	const int srcW= c->srcW;
	const int dstW= c->dstW;
	const int dstH= c->dstH;
	const int chrDstW= c->chrDstW;
	const int chrSrcW= c->chrSrcW;
	const int lumXInc= c->lumXInc;
	const int chrXInc= c->chrXInc;
	const int dstFormat= c->dstFormat;
	const int srcFormat= c->srcFormat;
	const int flags= c->flags;
	const int canMMX2BeUsed= c->canMMX2BeUsed;
	int16_t *vLumFilterPos= c->vLumFilterPos;
	int16_t *vChrFilterPos= c->vChrFilterPos;
	int16_t *hLumFilterPos= c->hLumFilterPos;
	int16_t *hChrFilterPos= c->hChrFilterPos;
	int16_t *vLumFilter= c->vLumFilter;
	int16_t *vChrFilter= c->vChrFilter;
	int16_t *hLumFilter= c->hLumFilter;
	int16_t *hChrFilter= c->hChrFilter;
	int32_t *lumMmxFilter= c->lumMmxFilter;
	int32_t *chrMmxFilter= c->chrMmxFilter;
	const int vLumFilterSize= c->vLumFilterSize;
	const int vChrFilterSize= c->vChrFilterSize;
	const int hLumFilterSize= c->hLumFilterSize;
	const int hChrFilterSize= c->hChrFilterSize;
	int16_t **lumPixBuf= c->lumPixBuf;
	int16_t **chrPixBuf= c->chrPixBuf;
	const int vLumBufSize= c->vLumBufSize;
	const int vChrBufSize= c->vChrBufSize;
	uint8_t *funnyYCode= c->funnyYCode;
	uint8_t *funnyUVCode= c->funnyUVCode;
	uint8_t *formatConvBuffer= c->formatConvBuffer;
	const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
	/* ceiling division: a chroma slice may cover a partial last line */
	const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
	int lastDstY;

	/* vars which will change and which we need to store back in the context */
	int dstY= c->dstY;
	int lumBufIndex= c->lumBufIndex;
	int chrBufIndex= c->chrBufIndex;
	int lastInLumBuf= c->lastInLumBuf;
	int lastInChrBuf= c->lastInChrBuf;

	/* packed input has a single plane; alias it to all three plane slots
	   so the per-plane code below can treat every format uniformly */
	if(isPacked(c->srcFormat)){
		src[0]=
		src[1]=
		src[2]= src[0];
		srcStride[0]=
		srcStride[1]=
		srcStride[2]= srcStride[0];
	}
	srcStride[1]<<= c->vChrDrop;
	srcStride[2]<<= c->vChrDrop;

//	printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
//		(int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
	{
	static volatile int i=0;
	i++;
	if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
		selfTest(src, srcStride, c->srcW, c->srcH);
	i--;
	}
#endif

//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
//dstStride[0],dstStride[1],dstStride[2]);

	if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
	{
		static int firstTime=1; //FIXME move this into the context perhaps
		if(flags & SWS_PRINT_INFO && firstTime)
		{
			MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
				"SwScaler: ->cannot do aligned memory acesses anymore\n");
			firstTime=0;
		}
	}

	/* Note the user might start scaling the picture in the middle so this will not get executed
	   this is not really intended but works currently, so ppl might do it */
	if(srcSliceY ==0){
		lumBufIndex=0;
		chrBufIndex=0;
		dstY=0;
		lastInLumBuf= -1;
		lastInChrBuf= -1;
	}

	lastDstY= dstY;

	for(;dstY < dstH; dstY++){
		unsigned char *dest =dst[0]+dstStride[0]*dstY;
		const int chrDstY= dstY>>c->chrDstVSubSample;
		unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
		unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

		const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
		const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
		const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
		const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
		//handle holes (FAST_BILINEAR & weird filters)
		if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
		if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
		ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
		ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

		// Do we have enough lines in this slice to output the dstY line
		if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
		{
			//Do horizontal scaling
			while(lastInLumBuf < lastLumSrcY)
			{
				uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
				lumBufIndex++;
//				printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
				ASSERT(lumBufIndex < 2*vLumBufSize)
				ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
				ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
//				printf("%d %d\n", lumBufIndex, vLumBufSize);
				RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
						flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
						funnyYCode, c->srcFormat, formatConvBuffer,
						c->lumMmx2Filter, c->lumMmx2FilterPos);
				lastInLumBuf++;
			}
			while(lastInChrBuf < lastChrSrcY)
			{
				uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
				uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
				chrBufIndex++;
				ASSERT(chrBufIndex < 2*vChrBufSize)
				ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
				ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
				//FIXME replace parameters through context struct (some at least)

				/* gray output needs no chroma; skip the horizontal chroma scale */
				if(!(isGray(srcFormat) || isGray(dstFormat)))
				RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
						flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
						funnyUVCode, c->srcFormat, formatConvBuffer,
						c->chrMmx2Filter, c->chrMmx2FilterPos);
				lastInChrBuf++;
			}
			//wrap buf index around to stay inside the ring buffer
			if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
			if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
		}
		else // not enough lines left in this slice -> load the rest in the buffer
		{
/*		printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
			firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
			lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
			vChrBufSize, vLumBufSize);*/

			//Do horizontal scaling
			while(lastInLumBuf+1 < srcSliceY + srcSliceH)
			{
				uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
				lumBufIndex++;
				ASSERT(lumBufIndex < 2*vLumBufSize)
				ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
				ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
				RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
						flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
						funnyYCode, c->srcFormat, formatConvBuffer,
						c->lumMmx2Filter, c->lumMmx2FilterPos);
				lastInLumBuf++;
			}
			while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
			{
				uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
				uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
				chrBufIndex++;
				ASSERT(chrBufIndex < 2*vChrBufSize)
				ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
				ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)

				if(!(isGray(srcFormat) || isGray(dstFormat)))
				RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
						flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
						funnyUVCode, c->srcFormat, formatConvBuffer,
						c->chrMmx2Filter, c->chrMmx2FilterPos);
				lastInChrBuf++;
			}
			//wrap buf index around to stay inside the ring buffer
			if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
			if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
			break; //we can't output a dstY line so let's try with the next slice
		}

#ifdef HAVE_MMX
		/* per-line dither tables for 15/16 bit RGB output */
		b5Dither= dither8[dstY&1];
		g6Dither= dither4[dstY&1];
		g5Dither= dither8[dstY&1];
		r5Dither= dither8[(dstY+1)&1];
#endif
		if(dstY < dstH-2)
		{
			/* NOTE(review): these pointers index past vLumBufSize/vChrBufSize;
			   presumably the second half of lumPixBuf/chrPixBuf mirrors the
			   first so ring-buffer wraparound needs no modulo here — the
			   allocation is done outside this view, confirm there. */
			int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
			int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
			/* pack line pointers and 16-bit coefficients into the layout the
			   MMX vertical-scale asm expects (4 int32 per filter tap) */
			int i;
			for(i=0; i<vLumFilterSize; i++)
			{
				/* NOTE(review): pointer stored in an int32_t slot — this
				   truncates on 64-bit; presumably the MMX path is only taken
				   on 32-bit builds here — TODO confirm */
				lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
				lumMmxFilter[4*i+2]=
				lumMmxFilter[4*i+3]=
					((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
			}
			for(i=0; i<vChrFilterSize; i++)
			{
				chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
				chrMmxFilter[4*i+2]=
				chrMmxFilter[4*i+3]=
					((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
			}
#endif
			if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
				const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
				if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
				RENAME(yuv2nv12X)(c,
					vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
					vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
					dest, uDest, dstW, chrDstW, dstFormat);
			}
			else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
			{
				const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
				if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
				if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
				{
					int16_t *lumBuf = lumPixBuf[0];
					int16_t *chrBuf= chrPixBuf[0];
					RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
				}
				else //General YV12
				{
					RENAME(yuv2yuvX)(c,
						vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
						vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
						dest, uDest, vDest, dstW, chrDstW);
				}
			}
			else
			{
				ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
				ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
				if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
				{
					int chrAlpha= vChrFilter[2*dstY+1];
					RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
						dest, dstW, chrAlpha, dstFormat, flags, dstY);
				}
				else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
				{
					int lumAlpha= vLumFilter[2*dstY+1];
					int chrAlpha= vChrFilter[2*dstY+1];
					RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
						dest, dstW, lumAlpha, chrAlpha, dstY);
				}
				else //General RGB
				{
					RENAME(yuv2packedX)(c,
						vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
						vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
						dest, dstW, dstY);
				}
			}
		}
		else // hmm looks like we can't use MMX here without overwriting this array's tail
		{
			/* last two output lines: fall back to the plain-C vertical scalers
			   (the MMX versions would read/write past the filter arrays) */
			int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
			int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
			if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
				const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
				if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
				yuv2nv12XinC(
					vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
					vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
					dest, uDest, dstW, chrDstW, dstFormat);
			}
			else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
			{
				const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
				if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
				yuv2yuvXinC(
					vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
					vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
					dest, uDest, vDest, dstW, chrDstW);
			}
			else
			{
				ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
				ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
				yuv2packedXinC(c,
					vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
					vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
					dest, dstW, dstY);
			}
		}
	}

#ifdef HAVE_MMX
	/* flush write-combining buffers and leave MMX state before returning */
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* store changed local vars back in the context */
	c->dstY= dstY;
	c->lumBufIndex= lumBufIndex;
	c->chrBufIndex= chrBufIndex;
	c->lastInLumBuf= lastInLumBuf;
	c->lastInChrBuf= lastInChrBuf;

	return dstY - lastDstY;
}