Mercurial > mplayer.hg
annotate postproc/swscale_template.c @ 4785:fefd14e3072f
new input cucc is default
author | pontscho |
---|---|
date | Thu, 21 Feb 2002 13:31:05 +0000 |
parents | 319d80378895 |
children | 68e7ed0c22be |
rev | line source |
---|---|
4295 | 1 /* |
2 Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at> | |
2216 | 3 |
4295 | 4 This program is free software; you can redistribute it and/or modify |
5 it under the terms of the GNU General Public License as published by | |
6 the Free Software Foundation; either version 2 of the License, or | |
7 (at your option) any later version. | |
2216 | 8 |
4295 | 9 This program is distributed in the hope that it will be useful, |
10 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 GNU General Public License for more details. | |
13 | |
14 You should have received a copy of the GNU General Public License | |
15 along with this program; if not, write to the Free Software | |
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
17 */ | |
2264
7851375ea156
increased precision of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents:
2237
diff
changeset
|
18 |
/* Drop any earlier definitions of the helper macros that are defined again further down in this file. */
2540 | 19 #undef MOVNTQ |
2680 | 20 #undef PAVGB |
3136 | 21 #undef PREFETCH |
22 #undef PREFETCHW | |
23 #undef EMMS | |
24 #undef SFENCE | |
25 | |
26 #ifdef HAVE_3DNOW | |
27 /* EMMS expands to "femms" when HAVE_3DNOW is defined and to "emms" otherwise. On K6 femms is faster than emms. On K7 femms is directly mapped onto emms. */ | |
28 #define EMMS "femms" | |
29 #else | |
30 #define EMMS "emms" | |
31 #endif | |
32 | |
/* PREFETCH / PREFETCHW: mnemonic strings for pasting into inline asm.
   HAVE_3DNOW selects "prefetch" / "prefetchw"; otherwise HAVE_MMX2 selects
   "prefetchnta" / "prefetcht0"; with neither defined both expand to "/nop".
   NOTE(review): the leading '/' presumably makes the assembler treat the rest
   of the line as a comment - confirm for the targeted assemblers. */
33 #ifdef HAVE_3DNOW | |
34 #define PREFETCH "prefetch" | |
35 #define PREFETCHW "prefetchw" | |
36 #elif defined ( HAVE_MMX2 ) | |
37 #define PREFETCH "prefetchnta" | |
38 #define PREFETCHW "prefetcht0" | |
39 #else | |
40 #define PREFETCH "/nop" | |
41 #define PREFETCHW "/nop" | |
42 #endif | |
43 | |
/* SFENCE expands to "sfence" when HAVE_MMX2 is defined and to "/nop" otherwise. */
44 #ifdef HAVE_MMX2 | |
45 #define SFENCE "sfence" | |
46 #else | |
47 #define SFENCE "/nop" | |
48 #endif | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
49 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
50 #ifdef HAVE_MMX2 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
51 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
52 #elif defined (HAVE_3DNOW) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
53 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
54 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
55 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
56 #ifdef HAVE_MMX2 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
57 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
58 #else |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
59 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
60 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
61 |
3344 | 62 |
/*
 * YSCALEYUV2YV12X(x): inline-asm text that sums several filtered source rows
 * and writes 8 packed unsigned bytes per output step.
 *
 * Operand use, as read from the instructions below:
 *   %0  loaded into edx as the tap counter; it is incremented until it reaches
 *       zero (presumably a negated filter size - TODO confirm against callers)
 *   %1  base of a table of 4-byte entries, indexed by edx, each used as a source address
 *   %2  base of a table of 8-byte filter coefficients, indexed by edx
 *   %3  destination base, written at byte offset eax
 *   %4  bound compared (unsigned) against eax
 *   x   byte displacement added to every source address
 *
 * eax, edx, esi and mm0, mm2-mm5 are overwritten. The tap loop and the output
 * loop both branch to the single label "1"; before the outer "jb" the
 * accumulators are cleared and edx is reloaded, so execution re-enters the
 * tap loop in its initial state.
 */
63 #define YSCALEYUV2YV12X(x) \ | |
64 "xorl %%eax, %%eax \n\t" /* eax = output offset, starts at 0 */\ | |
65 "pxor %%mm3, %%mm3 \n\t" /* accumulator for source words 0..3 */\ | |
66 "pxor %%mm4, %%mm4 \n\t" /* accumulator for source words 4..7 */\ | |
67 "movl %0, %%edx \n\t" /* tap counter */\ | |
68 ".balign 16 \n\t" /* FIXME Unroll? */\ | |
69 "1: \n\t"\ | |
70 "movl (%1, %%edx, 4), %%esi \n\t" /* source row address for this tap */\ | |
71 "movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\ | |
72 "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData, words 0..3 */\ | |
73 "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData, words 4..7 */\ | |
74 "pmulhw %%mm0, %%mm2 \n\t"\ | |
75 "pmulhw %%mm0, %%mm5 \n\t"\ | |
76 "paddw %%mm2, %%mm3 \n\t"\ | |
77 "paddw %%mm5, %%mm4 \n\t"\ | |
78 "addl $1, %%edx \n\t"\ | |
79 " jnz 1b \n\t" /* next tap until edx reaches 0 */\ | |
80 "psraw $3, %%mm3 \n\t"\ | |
81 "psraw $3, %%mm4 \n\t"\ | |
82 "packuswb %%mm4, %%mm3 \n\t" /* 8 words -> 8 bytes with unsigned saturation */\ | |
83 MOVNTQ(%%mm3, (%3, %%eax))\ | |
84 "addl $8, %%eax \n\t"\ | |
85 "cmpl %4, %%eax \n\t"\ | |
86 "pxor %%mm3, %%mm3 \n\t" /* reset state for the next 8 outputs; the flags from cmpl are still live */\ | |
87 "pxor %%mm4, %%mm4 \n\t"\ | |
88 "movl %0, %%edx \n\t"\ | |
89 "jb 1b \n\t" | |
90 | |
/*
 * YSCALEYUV2YV121: inline-asm text that converts 16-bit source words to bytes
 * without any filtering.
 *
 * eax is loaded from %2 and used as the running index. Each pass reads two
 * quadwords from %0 + eax*2, shifts every word right arithmetically by 7,
 * packs the 8 words to bytes with unsigned saturation and stores them at
 * %1 + eax through MOVNTQ. eax then advances by 8 and the loop repeats while
 * that addition produces no carry (presumably %2 is a negative count so the
 * carry marks the end - TODO confirm against callers).
 * eax, mm0 and mm1 are overwritten.
 */
91 #define YSCALEYUV2YV121 \ | |
92 "movl %2, %%eax \n\t"\ | |
93 ".balign 16 \n\t" /* FIXME Unroll? */\ | |
94 "1: \n\t"\ | |
95 "movq (%0, %%eax, 2), %%mm0 \n\t"\ | |
96 "movq 8(%0, %%eax, 2), %%mm1 \n\t"\ | |
97 "psraw $7, %%mm0 \n\t"\ | |
98 "psraw $7, %%mm1 \n\t"\ | |
99 "packuswb %%mm1, %%mm0 \n\t"\ | |
100 MOVNTQ(%%mm0, (%1, %%eax))\ | |
101 "addl $8, %%eax \n\t"\ | |
102 "jnc 1b \n\t" | |
103 | |
104 /* | |
YSCALEYUV2RGBX: inline-asm text that runs a chroma tap loop and a luma tap loop
(both labelled "2") and then combines the sums into packed B, G and R bytes.
It opens the outer label "1" and zeroes eax, but contains neither a store nor
the branch back to "1"; the macro ends with mm2 = packed B, mm4 = packed G,
mm5 = packed R and mm7 = 0. Chroma accumulates in mm3 (U) and mm4 (V, read
4096 bytes after U); luma accumulates in mm1 and mm7.
Operand and clobber list this macro is written against:
105 :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
106 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
107 "r" (dest), "m" (dstW), | |
108 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
109 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
110 */ | |
111 #define YSCALEYUV2RGBX \ | |
112 "xorl %%eax, %%eax \n\t"\ | |
113 ".balign 16 \n\t"\ | |
114 "1: \n\t"\ | |
115 "movl %1, %%edx \n\t" /* -chrFilterSize */\ | |
116 "movl %3, %%ebx \n\t" /* chrMmxFilter+chrFilterSize*4 (per the operand list above) */\ | |
117 "movl %7, %%ecx \n\t" /* chrSrc+chrFilterSize (per the operand list above) */\ | |
118 "pxor %%mm3, %%mm3 \n\t"\ | |
119 "pxor %%mm4, %%mm4 \n\t"\ | |
120 "2: \n\t"\ | |
121 "movl (%%ecx, %%edx, 4), %%esi \n\t"\ | |
122 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\ | |
123 "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\ | |
124 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\ | |
125 "pmulhw %%mm0, %%mm2 \n\t"\ | |
126 "pmulhw %%mm0, %%mm5 \n\t"\ | |
127 "paddw %%mm2, %%mm3 \n\t"\ | |
128 "paddw %%mm5, %%mm4 \n\t"\ | |
129 "addl $1, %%edx \n\t"\ | |
130 " jnz 2b \n\t"\ | |
131 \ | |
132 "movl %0, %%edx \n\t" /* -lumFilterSize */\ | |
133 "movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize*4 (per the operand list above) */\ | |
134 "movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\ | |
135 "pxor %%mm1, %%mm1 \n\t"\ | |
136 "pxor %%mm7, %%mm7 \n\t"\ | |
137 "2: \n\t"\ | |
138 "movl (%%ecx, %%edx, 4), %%esi \n\t"\ | |
139 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\ | |
140 "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\ | |
141 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\ | |
142 "pmulhw %%mm0, %%mm2 \n\t"\ | |
143 "pmulhw %%mm0, %%mm5 \n\t"\ | |
144 "paddw %%mm2, %%mm1 \n\t"\ | |
145 "paddw %%mm5, %%mm7 \n\t"\ | |
146 "addl $1, %%edx \n\t"\ | |
147 " jnz 2b \n\t"\ | |
148 \ | |
4248 | 149 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\ |
150 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\ | |
3344 | 151 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
152 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
4248 | 153 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\ |
154 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\ | |
3344 | 155 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
4248 | 156 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\ |
157 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\ | |
158 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\ | |
159 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\ | |
160 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\ | |
161 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\ | |
3344 | 162 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
163 "paddw %%mm3, %%mm4 \n\t"\ | |
164 "movq %%mm2, %%mm0 \n\t"\ | |
165 "movq %%mm5, %%mm6 \n\t"\ | |
166 "movq %%mm4, %%mm3 \n\t"\ | |
167 "punpcklwd %%mm2, %%mm2 \n\t"\ | |
168 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
169 "punpcklwd %%mm4, %%mm4 \n\t"\ | |
170 "paddw %%mm1, %%mm2 \n\t"\ | |
171 "paddw %%mm1, %%mm5 \n\t"\ | |
172 "paddw %%mm1, %%mm4 \n\t"\ | |
173 "punpckhwd %%mm0, %%mm0 \n\t"\ | |
174 "punpckhwd %%mm6, %%mm6 \n\t"\ | |
175 "punpckhwd %%mm3, %%mm3 \n\t"\ | |
176 "paddw %%mm7, %%mm0 \n\t"\ | |
177 "paddw %%mm7, %%mm6 \n\t"\ | |
178 "paddw %%mm7, %%mm3 \n\t"\ | |
179 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
180 "packuswb %%mm0, %%mm2 \n\t"\ | |
181 "packuswb %%mm6, %%mm5 \n\t"\ | |
182 "packuswb %%mm3, %%mm4 \n\t"\ | |
183 "pxor %%mm7, %%mm7 \n\t" | |
184 | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
185 #define FULL_YSCALEYUV2RGB \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
186 "pxor %%mm7, %%mm7 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
187 "movd %6, %%mm6 \n\t" /*yalpha1*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
188 "punpcklwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
189 "punpcklwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
190 "movd %7, %%mm5 \n\t" /*uvalpha1*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
191 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
192 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
193 "xorl %%eax, %%eax \n\t"\ |
2800
7847d6b7ad3d
.balign or we'll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
194 ".balign 16 \n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
195 "1: \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
196 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
197 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
198 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
199 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
200 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
201 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
202 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
203 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
204 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
205 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
206 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
207 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
208 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
209 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
210 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ |
4248 | 211 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\ |
212 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\ | |
213 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
214 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
215 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
216 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
217 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
4248 | 218 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
219 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ |
4248 | 220 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
221 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ |
4248 | 222 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
223 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
224 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
225 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ |
4248 | 226 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\ |
227 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
228 "paddw %%mm1, %%mm3 \n\t" /* B*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
229 "paddw %%mm1, %%mm0 \n\t" /* R*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
230 "packuswb %%mm3, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
231 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
232 "packuswb %%mm0, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
233 "paddw %%mm4, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
234 "paddw %%mm2, %%mm1 \n\t" /* G*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
235 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
236 "packuswb %%mm1, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
237 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
238 #define YSCALEYUV2RGB \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
239 "movd %6, %%mm6 \n\t" /*yalpha1*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
240 "punpcklwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
241 "punpcklwd %%mm6, %%mm6 \n\t"\ |
4248 | 242 "movq %%mm6, "MANGLE(asm_yalpha1)"\n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
243 "movd %7, %%mm5 \n\t" /*uvalpha1*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
244 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
245 "punpcklwd %%mm5, %%mm5 \n\t"\ |
4248 | 246 "movq %%mm5, "MANGLE(asm_uvalpha1)"\n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
247 "xorl %%eax, %%eax \n\t"\ |
2800
7847d6b7ad3d
.balign or we'll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
248 ".balign 16 \n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
249 "1: \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
250 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
251 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
252 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
253 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
254 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
255 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ |
4248 | 256 "movq "MANGLE(asm_uvalpha1)", %%mm0\n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
257 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
258 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
259 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
260 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
261 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
262 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ |
4248 | 263 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\ |
264 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
265 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
266 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
4248 | 267 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\ |
268 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
269 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
270 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
271 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
272 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
273 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
274 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
275 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ |
4248 | 276 "pmulhw "MANGLE(asm_yalpha1)", %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
277 "pmulhw "MANGLE(asm_yalpha1)", %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
278 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
279 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
280 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
281 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
4248 | 282 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\ |
283 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\ | |
284 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\ | |
285 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\ | |
286 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\ | |
287 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
288 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
289 "paddw %%mm3, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
290 "movq %%mm2, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
291 "movq %%mm5, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
292 "movq %%mm4, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
293 "punpcklwd %%mm2, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
294 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
295 "punpcklwd %%mm4, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
296 "paddw %%mm1, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
297 "paddw %%mm1, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
298 "paddw %%mm1, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
299 "punpckhwd %%mm0, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
300 "punpckhwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
301 "punpckhwd %%mm3, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
302 "paddw %%mm7, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
303 "paddw %%mm7, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
304 "paddw %%mm7, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
305 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
306 "packuswb %%mm0, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
307 "packuswb %%mm6, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
308 "packuswb %%mm3, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
309 "pxor %%mm7, %%mm7 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
310 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
311 #define YSCALEYUV2RGB1 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
312 "xorl %%eax, %%eax \n\t"\ |
2800
7847d6b7ad3d
.balign or we'll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
313 ".balign 16 \n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
314 "1: \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
315 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
316 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
317 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
318 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ |
4248 | 319 "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\ |
320 "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
321 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
322 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
4248 | 323 "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\ |
324 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
325 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
326 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
327 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
328 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
329 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
4248 | 330 "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\ |
331 "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\ | |
332 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\ | |
333 "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\ | |
334 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\ | |
335 "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\ | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
336 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
337 "paddw %%mm3, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
338 "movq %%mm2, %%mm0 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
339 "movq %%mm5, %%mm6 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
340 "movq %%mm4, %%mm3 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
341 "punpcklwd %%mm2, %%mm2 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
342 "punpcklwd %%mm5, %%mm5 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
343 "punpcklwd %%mm4, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
344 "paddw %%mm1, %%mm2 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
345 "paddw %%mm1, %%mm5 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
346 "paddw %%mm1, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
347 "punpckhwd %%mm0, %%mm0 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
348 "punpckhwd %%mm6, %%mm6 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
349 "punpckhwd %%mm3, %%mm3 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
350 "paddw %%mm7, %%mm0 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
351 "paddw %%mm7, %%mm6 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
352 "paddw %%mm7, %%mm3 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
353 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
354 "packuswb %%mm0, %%mm2 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
355 "packuswb %%mm6, %%mm5 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
356 "packuswb %%mm3, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
357 "pxor %%mm7, %%mm7 \n\t" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
358 |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
359 // do vertical chrominance interpolation |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
/*
 * YSCALEYUV2RGB1b: YUV -> RGB kernel for the 1:1 vertical case with vertical
 * chrominance interpolation.
 *
 * Asm operands used here: %0 = luma line (buf0), %2 / %3 = the two chroma
 * lines (uvbuf0 / uvbuf1).  %1 is not read.  V samples are fetched 4096 bytes
 * after the U samples of the same chroma line.
 *
 * Chroma is formed as (uvbuf0 + uvbuf1) >> 5, luma as buf0 >> 4.
 *
 * The macro zeroes %eax, opens the loop at local label "1:" and handles
 * 8 pixels per pass.  It does not close the loop itself: it is meant to be
 * followed by one of the WRITEBGR* macros, which store the pixels, advance
 * %eax and jump back to "1:".
 *
 * On exit from the macro body: mm2 = B, mm4 = G, mm5 = R (packed unsigned
 * bytes), mm7 = 0.  mm0, mm1, mm3 and mm6 are used as scratch.
 */
#define YSCALEYUV2RGB1b \
	"xorl %%eax, %%eax \n\t"\
	".balign 16 \n\t"\
	"1: \n\t"\
	"movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax] */\
	"movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax] */\
	"movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048] */\
	"movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048] */\
	"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax] */\
	"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048] */\
	"psrlw $5, %%mm3 \n\t" /* FIXME might overflow */\
	"psrlw $5, %%mm4 \n\t" /* FIXME might overflow */\
	"psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8 */\
	"psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8 */\
	"movq %%mm3, %%mm2 \n\t" /* (U-128)8 */\
	"movq %%mm4, %%mm5 \n\t" /* (V-128)8 */\
	"pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
	"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
	/* now: mm2=(U-128)8, mm3=ug, mm4=vg, mm5=(V-128)8 */\
	"movq (%0, %%eax, 2), %%mm1 \n\t" /* buf0[eax], first 4 luma samples */\
	"movq 8(%0, %%eax, 2), %%mm7 \n\t" /* next 4 luma samples of buf0 */\
	"psraw $4, %%mm1 \n\t" /* buf0[eax] >>4 */\
	"psraw $4, %%mm7 \n\t" /* buf0[eax] >>4 (second half) */\
	"pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
	"pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
	"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16) */\
	"psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16) */\
	"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
	"pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
	/* now: mm1=Y1, mm2=ub, mm3=ug, mm4=vg, mm5=vr, mm7=Y2 */\
	"paddw %%mm3, %%mm4 \n\t"\
	"movq %%mm2, %%mm0 \n\t"\
	"movq %%mm5, %%mm6 \n\t"\
	"movq %%mm4, %%mm3 \n\t"\
	"punpcklwd %%mm2, %%mm2 \n\t"\
	"punpcklwd %%mm5, %%mm5 \n\t"\
	"punpcklwd %%mm4, %%mm4 \n\t"\
	"paddw %%mm1, %%mm2 \n\t"\
	"paddw %%mm1, %%mm5 \n\t"\
	"paddw %%mm1, %%mm4 \n\t"\
	"punpckhwd %%mm0, %%mm0 \n\t"\
	"punpckhwd %%mm6, %%mm6 \n\t"\
	"punpckhwd %%mm3, %%mm3 \n\t"\
	"paddw %%mm7, %%mm0 \n\t"\
	"paddw %%mm7, %%mm6 \n\t"\
	"paddw %%mm7, %%mm3 \n\t"\
	/* now: mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
	"packuswb %%mm0, %%mm2 \n\t"\
	"packuswb %%mm6, %%mm5 \n\t"\
	"packuswb %%mm3, %%mm4 \n\t"\
	"pxor %%mm7, %%mm7 \n\t"
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
411 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
/*
 * WRITEBGR32: store 8 pixels as 32-bit words (byte lanes 0RGB as drawn in the
 * comments below) and close the pixel loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R as packed bytes and mm7 = 0.
 * Writes 32 bytes at %4 + 4*%eax through MOVNTQ (defined elsewhere in this
 * file), adds 8 to %eax and jumps back to label "1:" while %eax is below %5
 * (unsigned compare).  mm0-mm3 and mm5, mm6 are overwritten.
 */
#define WRITEBGR32 \
	/* in: mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1 \n\t" /* B */\
	"movq %%mm5, %%mm6 \n\t" /* R */\
	"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
	/* four quadwords, two pixels each */\
	MOVNTQ(%%mm0, (%4, %%eax, 4))\
	MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
	MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
	MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
	/* next 8 pixels */\
	"addl $8, %%eax \n\t"\
	"cmpl %5, %%eax \n\t"\
	" jb 1b \n\t"
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
435 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
/*
 * WRITEBGR16: store 8 pixels as 16-bit words and close the pixel loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R as packed bytes and mm7 = 0.
 * B and R are masked with bF8 and G with bFC (constants defined elsewhere;
 * presumably 0xF8 / 0xFC replicated per byte, i.e. 5/6/5 significant bits --
 * confirm against their definition).  B is shifted down by 3, G is widened to
 * words and shifted up by 3, and R is interleaved into the high byte of each
 * word.
 *
 * Writes 16 bytes at %4 + 2*%eax through MOVNTQ, adds 8 to %eax and jumps
 * back to label "1:" while %eax is below %5.  mm1-mm4 are overwritten.
 */
#define WRITEBGR16 \
	"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
	"pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
	"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
	"psrlq $3, %%mm2 \n\t"\
\
	/* keep copies for the upper 4 pixels */\
	"movq %%mm2, %%mm1 \n\t"\
	"movq %%mm4, %%mm3 \n\t"\
\
	/* low half -> mm3 (G words) / mm2 (R:B words), high half -> mm4 / mm1 */\
	"punpcklbw %%mm7, %%mm3 \n\t"\
	"punpcklbw %%mm5, %%mm2 \n\t"\
	"punpckhbw %%mm7, %%mm4 \n\t"\
	"punpckhbw %%mm5, %%mm1 \n\t"\
\
	"psllq $3, %%mm3 \n\t"\
	"psllq $3, %%mm4 \n\t"\
\
	/* merge G into the R:B words */\
	"por %%mm3, %%mm2 \n\t"\
	"por %%mm4, %%mm1 \n\t"\
\
	MOVNTQ(%%mm2, (%4, %%eax, 2))\
	MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
	/* next 8 pixels */\
	"addl $8, %%eax \n\t"\
	"cmpl %5, %%eax \n\t"\
	" jb 1b \n\t"
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
462 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
/*
 * WRITEBGR15: store 8 pixels as 16-bit words with 5 bits per component and
 * close the pixel loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R as packed bytes and mm7 = 0.
 * All three components are masked with bF8 (constant defined elsewhere;
 * presumably 0xF8 replicated per byte -- confirm against its definition).
 * B is shifted down by 3, R down by 1, and G is widened to words and shifted
 * up by 2 before the three are OR-ed together.
 *
 * Writes 16 bytes at %4 + 2*%eax through MOVNTQ, adds 8 to %eax and jumps
 * back to label "1:" while %eax is below %5.  mm1-mm5 are overwritten.
 */
#define WRITEBGR15 \
	"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
	"pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
	"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
	"psrlq $3, %%mm2 \n\t"\
	"psrlq $1, %%mm5 \n\t"\
\
	/* keep copies for the upper 4 pixels */\
	"movq %%mm2, %%mm1 \n\t"\
	"movq %%mm4, %%mm3 \n\t"\
\
	/* low half -> mm3 (G words) / mm2 (R:B words), high half -> mm4 / mm1 */\
	"punpcklbw %%mm7, %%mm3 \n\t"\
	"punpcklbw %%mm5, %%mm2 \n\t"\
	"punpckhbw %%mm7, %%mm4 \n\t"\
	"punpckhbw %%mm5, %%mm1 \n\t"\
\
	"psllq $2, %%mm3 \n\t"\
	"psllq $2, %%mm4 \n\t"\
\
	/* merge G into the R:B words */\
	"por %%mm3, %%mm2 \n\t"\
	"por %%mm4, %%mm1 \n\t"\
\
	MOVNTQ(%%mm2, (%4, %%eax, 2))\
	MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
	/* next 8 pixels */\
	"addl $8, %%eax \n\t"\
	"cmpl %5, %%eax \n\t"\
	" jb 1b \n\t"
2669 | 490 |
/*
 * WRITEBGR24OLD: older mask-based 24-bit writer; stores 8 pixels as 24 bytes
 * and closes the pixel loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R as packed bytes and mm7 = 0.
 * The pixels are first expanded to four 0RGB0RGB quadwords, then squeezed to
 * three quadwords using the bm00000111 / bm11111000 / bm00001111 masks
 * (defined elsewhere in this file).
 *
 * The destination is addressed through %ebx, which this macro never
 * initialises: the surrounding asm must load it before the loop.  Each pass
 * adds 24 to %ebx and 8 to %eax, then jumps back to label "1:" while %eax is
 * below %5.  mm0-mm6 are overwritten.
 */
#define WRITEBGR24OLD \
	/* in: mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1 \n\t" /* B */\
	"movq %%mm5, %%mm6 \n\t" /* R */\
	"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
	/* first output quadword -> mm0 */\
	"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
	"psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
	"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
	"pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
	"por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
	"movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
	"psllq $48, %%mm2 \n\t" /* GB000000 1 */\
	"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
	/* second output quadword -> mm2 */\
	"movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
	"psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
	"psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
	"por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
	"pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
	"movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
	"psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
	"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
	"pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
	"por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
	"movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
	"psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
	"por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
	/* third output quadword -> mm3 */\
	"psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
	"movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
	"psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
	"pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
	"pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
	"por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
	"psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
	"por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
	MOVNTQ(%%mm0, (%%ebx))\
	MOVNTQ(%%mm2, 8(%%ebx))\
	MOVNTQ(%%mm3, 16(%%ebx))\
	"addl $24, %%ebx \n\t"\
\
	/* next 8 pixels */\
	"addl $8, %%eax \n\t"\
	"cmpl %5, %%eax \n\t"\
	" jb 1b \n\t"
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
546 |
/*
 * WRITEBGR24MMX: 24-bit writer using only shifts and unpacks (no mask
 * constants); stores 8 pixels as 24 bytes and closes the pixel loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R as packed bytes and mm7 = 0.
 * The pixels are expanded to four 0RGB0RGB quadwords, each is compacted to
 * 0RGBRGB0, and the results are stitched into three output quadwords.
 *
 * The destination is addressed through %ebx, which this macro never
 * initialises: the surrounding asm must load it before the loop.  Each pass
 * adds 24 to %ebx and 8 to %eax, then jumps back to label "1:" while %eax is
 * below %5.  All of mm0-mm7 are overwritten, including the zero in mm7.
 */
#define WRITEBGR24MMX \
	/* in: mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1 \n\t" /* B */\
	"movq %%mm5, %%mm6 \n\t" /* R */\
	"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
	/* duplicate each pixel pair */\
	"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
	"movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
	"movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
	"movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
	"psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
	"psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
	"psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
	"psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
	"punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
	"punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
	"punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
	"punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
	/* first output quadword */\
	"psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
	"movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
	"psllq $40, %%mm2 \n\t" /* GB000000 1 */\
	"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
	MOVNTQ(%%mm0, (%%ebx))\
\
	/* second output quadword */\
	"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
	"movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
	"psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
	"por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
	MOVNTQ(%%mm6, 8(%%ebx))\
\
	/* third output quadword */\
	"psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
	"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
	"por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
	MOVNTQ(%%mm5, 16(%%ebx))\
\
	"addl $24, %%ebx \n\t"\
\
	/* next 8 pixels */\
	"addl $8, %%eax \n\t"\
	"cmpl %5, %%eax \n\t"\
	" jb 1b \n\t"
599 | |
/*
 * WRITEBGR24MMX2: 24-bit writer built on pshufw; stores 8 pixels as 24 bytes
 * and closes the pixel loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R as packed bytes.  mm7 is reloaded with
 * the M24C mask, so its incoming value is not used; mm0 holds M24A for the
 * whole body and M24B is applied straight from memory (all three masks are
 * defined elsewhere in this file).  Each output quadword is assembled by
 * shuffling the needed bytes of B, G and R into place, masking, and OR-ing.
 * Note that mm4 (G) is shifted right by 8 after the first quadword.
 *
 * The destination is addressed through %ebx, which this macro never
 * initialises: the surrounding asm must load it before the loop.  Each pass
 * adds 24 to %ebx and 8 to %eax, then jumps back to label "1:" while %eax is
 * below %5.  mm0, mm1, mm3, mm4, mm6 and mm7 are overwritten.
 */
#define WRITEBGR24MMX2 \
	/* in: mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq "MANGLE(M24A)", %%mm0 \n\t"\
	"movq "MANGLE(M24C)", %%mm7 \n\t"\
	"pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
	"pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
	"pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
	"pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
	"pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
	"pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
	/* first output quadword */\
	"psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
	"por %%mm1, %%mm6 \n\t"\
	"por %%mm3, %%mm6 \n\t"\
	MOVNTQ(%%mm6, (%%ebx))\
\
	"psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
	"pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
	"pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
	"pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
	"pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
	"pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
	"pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
	/* second output quadword */\
	"por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
	"por %%mm3, %%mm6 \n\t"\
	MOVNTQ(%%mm6, 8(%%ebx))\
\
	"pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
	"pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
	"pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
	"pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
	"pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
	"pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
	/* third output quadword */\
	"por %%mm1, %%mm3 \n\t"\
	"por %%mm3, %%mm6 \n\t"\
	MOVNTQ(%%mm6, 16(%%ebx))\
\
	"addl $24, %%ebx \n\t"\
\
	/* next 8 pixels */\
	"addl $8, %%eax \n\t"\
	"cmpl %5, %%eax \n\t"\
	" jb 1b \n\t"
647 | |
/*
 * Select the 24-bit writer for this build: the pshufw-based variant when
 * HAVE_MMX2 is defined, the plain MMX variant otherwise.  Any earlier
 * definition of WRITEBGR24 is dropped first.
 */
#undef WRITEBGR24
#ifdef HAVE_MMX2
#define WRITEBGR24 WRITEBGR24MMX2
#else
#define WRITEBGR24 WRITEBGR24MMX
#endif
655 | |
3344 | 656 static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, |
657 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
658 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, | |
659 int16_t * lumMmxFilter, int16_t * chrMmxFilter) | |
2519 | 660 { |
3344 | 661 #ifdef HAVE_MMX |
662 if(uDest != NULL) | |
663 { | |
664 asm volatile( | |
665 YSCALEYUV2YV12X(0) | |
666 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize), | |
667 "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1) | |
668 : "%eax", "%edx", "%esi" | |
669 ); | |
2519 | 670 |
3344 | 671 asm volatile( |
672 YSCALEYUV2YV12X(4096) | |
673 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize), | |
674 "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1) | |
675 : "%eax", "%edx", "%esi" | |
676 ); | |
677 } | |
2521 | 678 |
3344 | 679 asm volatile( |
680 YSCALEYUV2YV12X(0) | |
681 :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize), | |
682 "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW) | |
683 : "%eax", "%edx", "%esi" | |
684 ); | |
685 #else | |
3352 | 686 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, |
687 chrFilter, chrSrc, chrFilterSize, | |
688 dest, uDest, vDest, dstW); | |
3344 | 689 #endif |
690 } | |
691 | |
692 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc, | |
693 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW) | |
694 { | |
695 #ifdef HAVE_MMX | |
696 if(uDest != NULL) | |
697 { | |
698 asm volatile( | |
699 YSCALEYUV2YV121 | |
700 :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)), | |
701 "g" (-(dstW>>1)) | |
702 : "%eax" | |
703 ); | |
704 | |
705 asm volatile( | |
706 YSCALEYUV2YV121 | |
707 :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)), | |
708 "g" (-(dstW>>1)) | |
709 : "%eax" | |
710 ); | |
2519 | 711 } |
3344 | 712 |
713 asm volatile( | |
714 YSCALEYUV2YV121 | |
715 :: "r" (lumSrc + dstW), "r" (dest + dstW), | |
716 "g" (-dstW) | |
717 : "%eax" | |
718 ); | |
719 #else | |
720 //FIXME Optimize (just quickly writen not opti..) | |
721 //FIXME replace MINMAX with LUTs | |
722 int i; | |
723 for(i=0; i<dstW; i++) | |
724 { | |
725 int val= lumSrc[i]>>7; | |
726 | |
727 dest[i]= MIN(MAX(val>>19, 0), 255); | |
728 } | |
729 | |
730 if(uDest != NULL) | |
731 for(i=0; i<(dstW>>1); i++) | |
732 { | |
733 int u=chrSrc[i]>>7; | |
734 int v=chrSrc[i + 2048]>>7; | |
735 | |
736 uDest[i]= MIN(MAX(u>>19, 0), 255); | |
737 vDest[i]= MIN(MAX(v>>19, 0), 255); | |
738 } | |
739 #endif | |
2519 | 740 } |
741 | |
3344 | 742 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
743 /**
 * vertical scale YV12 to RGB
 *
 * Writes one packed output line of dstW pixels to dest in the layout
 * selected by dstFormat.
 *
 * With HAVE_MMX there is one asm statement per handled format
 * (IMGFMT_BGR32, IMGFMT_BGR24, IMGFMT_BGR15, IMGFMT_BGR16); each runs the
 * YSCALEYUV2RGBX macro followed by the matching WRITEBGR* store macro.
 * lumFilter and chrFilter are not used on this path.
 * Without HAVE_MMX the call is forwarded to yuv2rgbXinC() and the
 * lumMmxFilter / chrMmxFilter arguments are not used.
 *
 * NOTE(review): in the MMX build the if/else chain has no final else, so
 * any other dstFormat value leaves dest untouched -- confirm that callers
 * never pass one.
 * NOTE(review): the asm statements declare no "memory" clobber although
 * dest is passed in as an input pointer -- confirm this is safe.
 */ |
3344 | 746 static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, |
747 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
748 uint8_t *dest, int dstW, int dstFormat, int16_t * lumMmxFilter, int16_t * chrMmxFilter) |
3344 | 749 { |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
750 /* if(flags&SWS_FULL_UV_IPOL) |
3344 | 751 { |
752 //FIXME | |
753 }//FULL_UV_IPOL | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
754 else*/ |
3344 | 755 { |
756 #ifdef HAVE_MMX | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
/* All four asm statements below share one operand list:
   %0 = -lumFilterSize, %1 = -chrFilterSize,
   %2 = lumMmxFilter+lumFilterSize*4, %3 = chrMmxFilter+chrFilterSize*4,
   %4 = dest, %5 = dstW,
   %6 = lumSrc+lumFilterSize, %7 = chrSrc+chrFilterSize */
757 if(dstFormat == IMGFMT_BGR32) //FIXME untested |
3344 | 758 { |
759 asm volatile( | |
760 YSCALEYUV2RGBX | |
761 WRITEBGR32 | |
762 | |
763 :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
764 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
765 "r" (dest), "m" (dstW), | |
766 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
767 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
768 ); | |
769 } | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
770 else if(dstFormat == IMGFMT_BGR24) //FIXME untested |
3344 | 771 { |
772 asm volatile( | |
773 YSCALEYUV2RGBX | |
/* ebx = 3*eax + dest (%4) is set up for WRITEBGR24; eax is presumably the
   pixel index maintained by YSCALEYUV2RGBX -- confirm against the macro */
774 "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize | |
775 "addl %4, %%ebx \n\t" | |
776 WRITEBGR24 | |
777 | |
778 :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
779 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
780 "r" (dest), "m" (dstW), | |
781 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
782 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
783 ); | |
784 } | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
785 else if(dstFormat==IMGFMT_BGR15) |
3344 | 786 { |
787 asm volatile( | |
788 YSCALEYUV2RGBX | |
789 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
/* optional dithering: saturating add of the 5-bit dither patterns to
   B, G and R before the pixels are packed */
790 #ifdef DITHER1XBPP | |
4248 | 791 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
792 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
793 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
3344 | 794 #endif |
795 | |
796 WRITEBGR15 | |
797 | |
798 :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
799 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
800 "r" (dest), "m" (dstW), | |
801 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
802 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
803 ); | |
804 } | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
805 else if(dstFormat==IMGFMT_BGR16) |
3344 | 806 { |
807 asm volatile( | |
808 YSCALEYUV2RGBX | |
809 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
/* same as the BGR15 branch, except that green uses the g6Dither pattern */
810 #ifdef DITHER1XBPP | |
4248 | 811 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
812 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
813 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
3344 | 814 #endif |
815 | |
816 WRITEBGR16 | |
817 | |
818 :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
819 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
820 "r" (dest), "m" (dstW), | |
821 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
822 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
823 ); | |
824 } | |
825 #else | |
/* portable fallback: plain C implementation */
3352 | 826 yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize, |
827 chrFilter, chrSrc, chrFilterSize, | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
828 dest, dstW, dstFormat); |
3344 | 829 |
830 #endif | |
831 } //!FULL_UV_IPOL | |
832 } | |
833 | |
834 | |
835 /** | |
836 * vertical bilinear scale YV12 to RGB | |
837 */ | |
838 static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
839 uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstFormat, int flags) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
840 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
841 int yalpha1=yalpha^4095; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
842 int uvalpha1=uvalpha^4095; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
843 |
4467 | 844 if(flags&SWS_FULL_CHR_H_INT) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
845 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
846 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
847 #ifdef HAVE_MMX |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
848 if(dstFormat==IMGFMT_BGR32) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
849 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
850 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
851 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
852 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
853 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
854 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
855 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
856 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
857 "movq %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
858 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
859 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
860 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
861 MOVNTQ(%%mm3, (%4, %%eax, 4)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
862 MOVNTQ(%%mm1, 8(%4, %%eax, 4)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
863 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
864 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
865 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
866 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
867 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
868 |
3209 | 869 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
870 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
871 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
872 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
873 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
874 else if(dstFormat==IMGFMT_BGR24) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
875 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
876 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
877 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
878 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
879 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
880 // lsb ... msb |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
881 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
882 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
883 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
884 "movq %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
885 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
886 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
887 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
888 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
889 "psrlq $8, %%mm3 \n\t" // GR0BGR00 |
4248 | 890 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000 |
891 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00 | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
892 "por %%mm2, %%mm3 \n\t" // BGRBGR00 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
893 "movq %%mm1, %%mm2 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
894 "psllq $48, %%mm1 \n\t" // 000000BG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
895 "por %%mm1, %%mm3 \n\t" // BGRBGRBG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
896 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
897 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
898 "psrld $16, %%mm2 \n\t" // R000R000 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
899 "psrlq $24, %%mm1 \n\t" // 0BGR0000 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
900 "por %%mm2, %%mm1 \n\t" // RBGRR000 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
901 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
902 "movl %4, %%ebx \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
903 "addl %%eax, %%ebx \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
904 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
905 #ifdef HAVE_MMX2 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
906 //FIXME Alignment |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
907 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
908 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
909 #else |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
910 "movd %%mm3, (%%ebx, %%eax, 2) \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
911 "psrlq $32, %%mm3 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
912 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
913 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
914 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
915 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
916 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
917 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
918 |
3209 | 919 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
920 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
921 : "%eax", "%ebx" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
922 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
923 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
924 else if(dstFormat==IMGFMT_BGR15) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
925 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
926 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
927 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
928 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
929 #ifdef DITHER1XBPP |
4248 | 930 "paddusb "MANGLE(g5Dither)", %%mm1\n\t" |
931 "paddusb "MANGLE(r5Dither)", %%mm0\n\t" | |
932 "paddusb "MANGLE(b5Dither)", %%mm3\n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
933 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
934 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
935 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
936 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
937 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
938 "psrlw $3, %%mm3 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
939 "psllw $2, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
940 "psllw $7, %%mm0 \n\t" |
4248 | 941 "pand "MANGLE(g15Mask)", %%mm1 \n\t" |
942 "pand "MANGLE(r15Mask)", %%mm0 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
943 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
944 "por %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
945 "por %%mm1, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
946 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
947 MOVNTQ(%%mm0, (%4, %%eax, 2)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
948 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
949 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
950 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
951 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
952 |
3209 | 953 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
954 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
955 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
956 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
957 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
958 else if(dstFormat==IMGFMT_BGR16) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
959 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
960 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
961 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
962 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
963 #ifdef DITHER1XBPP |
4248 | 964 "paddusb "MANGLE(g6Dither)", %%mm1\n\t" |
965 "paddusb "MANGLE(r5Dither)", %%mm0\n\t" | |
966 "paddusb "MANGLE(b5Dither)", %%mm3\n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
967 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
968 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
969 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
970 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
971 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
972 "psrlw $3, %%mm3 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
973 "psllw $3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
974 "psllw $8, %%mm0 \n\t" |
4248 | 975 "pand "MANGLE(g16Mask)", %%mm1 \n\t" |
976 "pand "MANGLE(r16Mask)", %%mm0 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
977 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
978 "por %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
979 "por %%mm1, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
980 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
981 MOVNTQ(%%mm0, (%4, %%eax, 2)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
982 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
983 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
984 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
985 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
986 |
3209 | 987 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
988 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
989 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
990 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
991 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
992 #else |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
993 if(dstFormat==IMGFMT_BGR32) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
994 { |
2671 | 995 int i; |
3209 | 996 for(i=0;i<dstW;i++){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
997 // vertical linear interpolation && yuv2rgb in a single step: |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
998 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
999 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1000 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
2503 | 1001 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; |
1002 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1003 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1004 dest+= 4; |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1005 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1006 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1007 else if(dstFormat==IMGFMT_BGR24) |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1008 { |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1009 int i; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1010 for(i=0;i<dstW;i++){ |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1011 // vertical linear interpolation && yuv2rgb in a single step: |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1012 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1013 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1014 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1015 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1016 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1017 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1018 dest+= 3; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1019 } |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1020 } |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1021 else if(dstFormat==IMGFMT_BGR16) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1022 { |
2671 | 1023 int i; |
3209 | 1024 for(i=0;i<dstW;i++){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1025 // vertical linear interpolation && yuv2rgb in a single step: |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1026 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1027 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1028 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1029 |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1030 ((uint16_t*)dest)[i] = |
2584 | 1031 clip_table16b[(Y + yuvtab_40cf[U]) >>13] | |
1032 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1033 clip_table16r[(Y + yuvtab_3343[V]) >>13]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1034 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1035 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1036 else if(dstFormat==IMGFMT_BGR15) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1037 { |
2671 | 1038 int i; |
3209 | 1039 for(i=0;i<dstW;i++){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1040 // vertical linear interpolation && yuv2rgb in a single step: |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1041 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1042 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1043 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1044 |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1045 ((uint16_t*)dest)[i] = |
2584 | 1046 clip_table15b[(Y + yuvtab_40cf[U]) >>13] | |
1047 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1048 clip_table15r[(Y + yuvtab_3343[V]) >>13]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1049 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1050 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1051 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1052 }//FULL_UV_IPOL |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1053 else |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1054 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1055 #ifdef HAVE_MMX |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1056 if(dstFormat==IMGFMT_BGR32) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1057 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1058 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1059 YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1060 WRITEBGR32 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1061 |
3209 | 1062 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1063 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1064 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1065 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1066 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1067 else if(dstFormat==IMGFMT_BGR24) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1068 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1069 asm volatile( |
2728 | 1070 "movl %4, %%ebx \n\t" |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1071 YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1072 WRITEBGR24 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1073 |
3209 | 1074 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1075 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1076 : "%eax", "%ebx" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1077 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1078 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1079 else if(dstFormat==IMGFMT_BGR15) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1080 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1081 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1082 YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1083 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1084 #ifdef DITHER1XBPP |
4248 | 1085 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1086 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1087 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1088 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1089 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1090 WRITEBGR15 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1091 |
3209 | 1092 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1093 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1094 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1095 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1096 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1097 else if(dstFormat==IMGFMT_BGR16) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1098 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1099 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1100 YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1101 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1102 #ifdef DITHER1XBPP |
4248 | 1103 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1104 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1105 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1106 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1107 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1108 WRITEBGR16 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1109 |
3209 | 1110 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1111 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1112 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1113 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1114 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1115 #else |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1116 if(dstFormat==IMGFMT_BGR32) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1117 { |
2671 | 1118 int i; |
3209 | 1119 for(i=0; i<dstW-1; i+=2){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1120 // vertical linear interpolation && yuv2rgb in a single step: |
2575 | 1121 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
1122 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)]; | |
2585 | 1123 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1124 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2575 | 1125 |
1126 int Cb= yuvtab_40cf[U]; | |
1127 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1128 int Cr= yuvtab_3343[V]; | |
1129 | |
1130 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)]; | |
1131 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)]; | |
1132 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)]; | |
1133 | |
1134 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)]; | |
1135 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)]; | |
1136 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)]; | |
1137 } | |
1138 } | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1139 else if(dstFormat==IMGFMT_BGR24) |
2575 | 1140 { |
2671 | 1141 int i; |
3209 | 1142 for(i=0; i<dstW-1; i+=2){ |
2575 | 1143 // vertical linear interpolation && yuv2rgb in a single step: |
1144 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1145 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)]; | |
2585 | 1146 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1147 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2575 | 1148 |
1149 int Cb= yuvtab_40cf[U]; | |
1150 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1151 int Cr= yuvtab_3343[V]; | |
1152 | |
1153 dest[0]=clip_table[((Y1 + Cb) >>13)]; | |
1154 dest[1]=clip_table[((Y1 + Cg) >>13)]; | |
1155 dest[2]=clip_table[((Y1 + Cr) >>13)]; | |
1156 | |
1157 dest[3]=clip_table[((Y2 + Cb) >>13)]; | |
1158 dest[4]=clip_table[((Y2 + Cg) >>13)]; | |
1159 dest[5]=clip_table[((Y2 + Cr) >>13)]; | |
1160 dest+=6; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1161 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1162 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1163 else if(dstFormat==IMGFMT_BGR16) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1164 { |
2671 | 1165 int i; |
4297
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1166 #ifdef DITHER1XBPP |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1167 static int ditherb1=1<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1168 static int ditherg1=1<<13; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1169 static int ditherr1=2<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1170 static int ditherb2=3<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1171 static int ditherg2=3<<13; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1172 static int ditherr2=0<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1173 |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1174 ditherb1 ^= (1^2)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1175 ditherg1 ^= (1^2)<<13; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1176 ditherr1 ^= (1^2)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1177 ditherb2 ^= (3^0)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1178 ditherg2 ^= (3^0)<<13; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1179 ditherr2 ^= (3^0)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1180 #else |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1181 const int ditherb1=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1182 const int ditherg1=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1183 const int ditherr1=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1184 const int ditherb2=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1185 const int ditherg2=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1186 const int ditherr2=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1187 #endif |
3209 | 1188 for(i=0; i<dstW-1; i+=2){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1189 // vertical linear interpolation && yuv2rgb in a single step: |
2575 | 1190 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
1191 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)]; | |
2585 | 1192 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1193 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1194 |
2575 | 1195 int Cb= yuvtab_40cf[U]; |
1196 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1197 int Cr= yuvtab_3343[V]; | |
1198 | |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1199 ((uint16_t*)dest)[i] = |
4297
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1200 clip_table16b[(Y1 + Cb + ditherb1) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1201 clip_table16g[(Y1 + Cg + ditherg1) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1202 clip_table16r[(Y1 + Cr + ditherr1) >>13]; |
2575 | 1203 |
1204 ((uint16_t*)dest)[i+1] = | |
4297
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1205 clip_table16b[(Y2 + Cb + ditherb2) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1206 clip_table16g[(Y2 + Cg + ditherg2) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1207 clip_table16r[(Y2 + Cr + ditherr2) >>13]; |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1208 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1209 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1210 else if(dstFormat==IMGFMT_BGR15) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1211 { |
2671 | 1212 int i; |
4297
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1213 #ifdef DITHER1XBPP |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1214 static int ditherb1=1<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1215 static int ditherg1=1<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1216 static int ditherr1=2<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1217 static int ditherb2=3<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1218 static int ditherg2=3<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1219 static int ditherr2=0<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1220 |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1221 ditherb1 ^= (1^2)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1222 ditherg1 ^= (1^2)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1223 ditherr1 ^= (1^2)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1224 ditherb2 ^= (3^0)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1225 ditherg2 ^= (3^0)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1226 ditherr2 ^= (3^0)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1227 #else |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1228 const int ditherb1=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1229 const int ditherg1=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1230 const int ditherr1=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1231 const int ditherb2=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1232 const int ditherg2=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1233 const int ditherr2=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1234 #endif |
3209 | 1235 for(i=0; i<dstW-1; i+=2){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1236 // vertical linear interpolation && yuv2rgb in a single step: |
2575 | 1237 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
1238 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)]; | |
2585 | 1239 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1240 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1241 |
2575 | 1242 int Cb= yuvtab_40cf[U]; |
1243 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1244 int Cr= yuvtab_3343[V]; | |
1245 | |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1246 ((uint16_t*)dest)[i] = |
4297
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1247 clip_table15b[(Y1 + Cb + ditherb1) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1248 clip_table15g[(Y1 + Cg + ditherg1) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1249 clip_table15r[(Y1 + Cr + ditherr1) >>13]; |
2584 | 1250 |
2575 | 1251 ((uint16_t*)dest)[i+1] = |
4297
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1252 clip_table15b[(Y2 + Cb + ditherb2) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1253 clip_table15g[(Y2 + Cg + ditherg2) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1254 clip_table15r[(Y2 + Cr + ditherr2) >>13]; |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1255 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1256 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1257 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1258 } //!FULL_UV_IPOL |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1259 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1260 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1261 /** |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1262 * YV12 to RGB without vertical luma scaling or interpolation (chrominance may still be blended between uvbuf0 and uvbuf1 according to uvalpha) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1263 */ |
3344 | 1264 static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1265 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1266 { |
2671 | 1267 int uvalpha1=uvalpha^4095; |
3344 | 1268 const int yalpha1=0; |
2671 | 1269 |
4467 | 1270 if(flags&SWS_FULL_CHR_H_INT) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1271 { |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1272 RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstFormat, flags); |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1273 return; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1274 } |
2576 | 1275 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1276 #ifdef HAVE_MMX |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1277 if( uvalpha < 2048 ) // note: this is not correct (it shifts chrominance by 0.5 pixels) but it's a bit faster |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1278 { |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1279 if(dstFormat==IMGFMT_BGR32) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1280 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1281 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1282 YSCALEYUV2RGB1 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1283 WRITEBGR32 |
3344 | 1284 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1285 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1286 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1287 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1288 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1289 else if(dstFormat==IMGFMT_BGR24) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1290 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1291 asm volatile( |
2728 | 1292 "movl %4, %%ebx \n\t" |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1293 YSCALEYUV2RGB1 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1294 WRITEBGR24 |
3344 | 1295 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1296 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1297 : "%eax", "%ebx" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1298 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1299 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1300 else if(dstFormat==IMGFMT_BGR15) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1301 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1302 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1303 YSCALEYUV2RGB1 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1304 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1305 #ifdef DITHER1XBPP |
4248 | 1306 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1307 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1308 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1309 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1310 WRITEBGR15 |
3344 | 1311 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1312 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1313 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1314 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1315 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1316 else if(dstFormat==IMGFMT_BGR16) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1317 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1318 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1319 YSCALEYUV2RGB1 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1320 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1321 #ifdef DITHER1XBPP |
4248 | 1322 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1323 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1324 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1325 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1326 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1327 WRITEBGR16 |
3344 | 1328 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1329 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1330 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1331 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1332 } |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1333 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1334 else |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1335 { |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1336 if(dstFormat==IMGFMT_BGR32) |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1337 { |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1338 asm volatile( |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1339 YSCALEYUV2RGB1b |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1340 WRITEBGR32 |
3344 | 1341 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1342 "m" (yalpha1), "m" (uvalpha1) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1343 : "%eax" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1344 ); |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1345 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1346 else if(dstFormat==IMGFMT_BGR24) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1347 { |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1348 asm volatile( |
2728 | 1349 "movl %4, %%ebx \n\t" |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1350 YSCALEYUV2RGB1b |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1351 WRITEBGR24 |
3344 | 1352 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1353 "m" (yalpha1), "m" (uvalpha1) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1354 : "%eax", "%ebx" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1355 ); |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1356 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1357 else if(dstFormat==IMGFMT_BGR15) |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1358 { |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1359 asm volatile( |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1360 YSCALEYUV2RGB1b |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1361 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1362 #ifdef DITHER1XBPP |
4248 | 1363 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1364 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1365 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1366 #endif |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1367 WRITEBGR15 |
3344 | 1368 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1369 "m" (yalpha1), "m" (uvalpha1) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1370 : "%eax" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1371 ); |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1372 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1373 else if(dstFormat==IMGFMT_BGR16) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1374 { |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1375 asm volatile( |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1376 YSCALEYUV2RGB1b |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1377 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1378 #ifdef DITHER1XBPP |
4248 | 1379 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1380 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1381 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1382 #endif |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1383 |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1384 WRITEBGR16 |
3344 | 1385 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1386 "m" (yalpha1), "m" (uvalpha1) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1387 : "%eax" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1388 ); |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1389 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1390 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1391 #else |
2576 | 1392 //FIXME write 2 versions (for even & odd lines) |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1393 |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1394 if(dstFormat==IMGFMT_BGR32) |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1395 { |
2671 | 1396 int i; |
3209 | 1397 for(i=0; i<dstW-1; i+=2){ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1398 // vertical linear interpolation && yuv2rgb in a single step: |
2576 | 1399 int Y1=yuvtab_2568[buf0[i]>>7]; |
1400 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
2585 | 1401 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1402 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2576 | 1403 |
1404 int Cb= yuvtab_40cf[U]; | |
1405 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1406 int Cr= yuvtab_3343[V]; | |
1407 | |
1408 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)]; | |
1409 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)]; | |
1410 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)]; | |
1411 | |
1412 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)]; | |
1413 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)]; | |
1414 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)]; | |
1415 } | |
1416 } | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1417 else if(dstFormat==IMGFMT_BGR24) |
2576 | 1418 { |
2671 | 1419 int i; |
3209 | 1420 for(i=0; i<dstW-1; i+=2){ |
2576 | 1421 // vertical linear interpolation && yuv2rgb in a single step: |
1422 int Y1=yuvtab_2568[buf0[i]>>7]; | |
1423 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
2585 | 1424 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1425 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2576 | 1426 |
1427 int Cb= yuvtab_40cf[U]; | |
1428 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1429 int Cr= yuvtab_3343[V]; | |
1430 | |
1431 dest[0]=clip_table[((Y1 + Cb) >>13)]; | |
1432 dest[1]=clip_table[((Y1 + Cg) >>13)]; | |
1433 dest[2]=clip_table[((Y1 + Cr) >>13)]; | |
1434 | |
1435 dest[3]=clip_table[((Y2 + Cb) >>13)]; | |
1436 dest[4]=clip_table[((Y2 + Cg) >>13)]; | |
1437 dest[5]=clip_table[((Y2 + Cr) >>13)]; | |
1438 dest+=6; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1439 } |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1440 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1441 else if(dstFormat==IMGFMT_BGR16) |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1442 { |
2671 | 1443 int i; |
4297
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1444 #ifdef DITHER1XBPP |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1445 static int ditherb1=1<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1446 static int ditherg1=1<<13; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1447 static int ditherr1=2<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1448 static int ditherb2=3<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1449 static int ditherg2=3<<13; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1450 static int ditherr2=0<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1451 |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1452 ditherb1 ^= (1^2)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1453 ditherg1 ^= (1^2)<<13; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1454 ditherr1 ^= (1^2)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1455 ditherb2 ^= (3^0)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1456 ditherg2 ^= (3^0)<<13; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1457 ditherr2 ^= (3^0)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1458 #else |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1459 const int ditherb1=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1460 const int ditherg1=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1461 const int ditherr1=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1462 const int ditherb2=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1463 const int ditherg2=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1464 const int ditherr2=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1465 #endif |
3209 | 1466 for(i=0; i<dstW-1; i+=2){ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1467 // vertical linear interpolation && yuv2rgb in a single step: |
2576 | 1468 int Y1=yuvtab_2568[buf0[i]>>7]; |
1469 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
2585 | 1470 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1471 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1472 |
2576 | 1473 int Cb= yuvtab_40cf[U]; |
1474 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1475 int Cr= yuvtab_3343[V]; | |
1476 | |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1477 ((uint16_t*)dest)[i] = |
4297
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1478 clip_table16b[(Y1 + Cb + ditherb1) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1479 clip_table16g[(Y1 + Cg + ditherg1) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1480 clip_table16r[(Y1 + Cr + ditherr1) >>13]; |
2576 | 1481 |
1482 ((uint16_t*)dest)[i+1] = | |
4297
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1483 clip_table16b[(Y2 + Cb + ditherb2) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1484 clip_table16g[(Y2 + Cg + ditherg2) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1485 clip_table16r[(Y2 + Cr + ditherr2) >>13]; |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1486 } |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1487 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1488 else if(dstFormat==IMGFMT_BGR15) |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1489 { |
2671 | 1490 int i; |
4297
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1491 #ifdef DITHER1XBPP |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1492 static int ditherb1=1<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1493 static int ditherg1=1<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1494 static int ditherr1=2<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1495 static int ditherb2=3<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1496 static int ditherg2=3<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1497 static int ditherr2=0<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1498 |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1499 ditherb1 ^= (1^2)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1500 ditherg1 ^= (1^2)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1501 ditherr1 ^= (1^2)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1502 ditherb2 ^= (3^0)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1503 ditherg2 ^= (3^0)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1504 ditherr2 ^= (3^0)<<14; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1505 #else |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1506 const int ditherb1=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1507 const int ditherg1=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1508 const int ditherr1=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1509 const int ditherb2=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1510 const int ditherg2=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1511 const int ditherr2=0; |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1512 #endif |
3209 | 1513 for(i=0; i<dstW-1; i+=2){ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1514 // vertical linear interpolation && yuv2rgb in a single step: |
2576 | 1515 int Y1=yuvtab_2568[buf0[i]>>7]; |
1516 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
2585 | 1517 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1518 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1519 |
2576 | 1520 int Cb= yuvtab_40cf[U]; |
1521 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1522 int Cr= yuvtab_3343[V]; | |
1523 | |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1524 ((uint16_t*)dest)[i] = |
4297
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1525 clip_table15b[(Y1 + Cb + ditherb1) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1526 clip_table15g[(Y1 + Cg + ditherg1) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1527 clip_table15r[(Y1 + Cr + ditherr1) >>13]; |
2584 | 1528 |
2576 | 1529 ((uint16_t*)dest)[i+1] = |
4297
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1530 clip_table15b[(Y2 + Cb + ditherb2) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1531 clip_table15g[(Y2 + Cg + ditherg2) >>13] | |
29fef3982238
15/16 bit dithering in C (5% slower, can be disabled by comenting #define DITHER1XBPP out)
michael
parents:
4295
diff
changeset
|
1532 clip_table15r[(Y2 + Cr + ditherr2) >>13]; |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1533 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1534 } |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1535 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1536 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1537 |
4481 | 1538 //FIXME yuy2* can read up to 7 samples too much |
1539 | |
4467 | 1540 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width) |
1541 { | |
4481 | 1542 #ifdef HAVE_MMX |
1543 asm volatile( | |
1544 "movq "MANGLE(bm01010101)", %%mm2\n\t" | |
1545 "movl %0, %%eax \n\t" | |
1546 "1: \n\t" | |
1547 "movq (%1, %%eax,2), %%mm0 \n\t" | |
1548 "movq 8(%1, %%eax,2), %%mm1 \n\t" | |
1549 "pand %%mm2, %%mm0 \n\t" | |
1550 "pand %%mm2, %%mm1 \n\t" | |
1551 "packuswb %%mm1, %%mm0 \n\t" | |
1552 "movq %%mm0, (%2, %%eax) \n\t" | |
1553 "addl $8, %%eax \n\t" | |
1554 " js 1b \n\t" | |
1555 : : "g" (-width), "r" (src+width*2), "r" (dst+width) | |
1556 : "%eax" | |
1557 ); | |
4467 | 1558 #else |
1559 int i; | |
1560 for(i=0; i<width; i++) | |
1561 dst[i]= src[2*i]; | |
1562 #endif | |
1563 } | |
1564 | |
1565 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1566 { | |
4481 | 1567 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1568 asm volatile( | |
1569 "movq "MANGLE(bm01010101)", %%mm4\n\t" | |
1570 "movl %0, %%eax \n\t" | |
1571 "1: \n\t" | |
1572 "movq (%1, %%eax,4), %%mm0 \n\t" | |
1573 "movq 8(%1, %%eax,4), %%mm1 \n\t" | |
1574 "movq (%2, %%eax,4), %%mm2 \n\t" | |
1575 "movq 8(%2, %%eax,4), %%mm3 \n\t" | |
1576 PAVGB(%%mm2, %%mm0) | |
1577 PAVGB(%%mm3, %%mm1) | |
1578 "psrlw $8, %%mm0 \n\t" | |
1579 "psrlw $8, %%mm1 \n\t" | |
1580 "packuswb %%mm1, %%mm0 \n\t" | |
1581 "movq %%mm0, %%mm1 \n\t" | |
1582 "psrlw $8, %%mm0 \n\t" | |
1583 "pand %%mm4, %%mm1 \n\t" | |
1584 "packuswb %%mm0, %%mm0 \n\t" | |
1585 "packuswb %%mm1, %%mm1 \n\t" | |
1586 "movd %%mm0, (%4, %%eax) \n\t" | |
1587 "movd %%mm1, (%3, %%eax) \n\t" | |
1588 "addl $4, %%eax \n\t" | |
1589 " js 1b \n\t" | |
1590 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width) | |
1591 : "%eax" | |
1592 ); | |
4467 | 1593 #else |
1594 int i; | |
1595 for(i=0; i<width; i++) | |
1596 { | |
1597 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1; | |
1598 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1; | |
1599 } | |
1600 #endif | |
1601 } | |
1602 | |
1603 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width) | |
1604 { | |
1605 #ifdef HAVE_MMXFIXME | |
1606 #else | |
1607 int i; | |
1608 for(i=0; i<width; i++) | |
1609 { | |
1610 int b= src[i*4+0]; | |
1611 int g= src[i*4+1]; | |
1612 int r= src[i*4+2]; | |
1613 | |
1614 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; | |
1615 } | |
1616 #endif | |
1617 } | |
1618 | |
1619 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1620 { | |
1621 #ifdef HAVE_MMXFIXME | |
1622 #else | |
1623 int i; | |
1624 for(i=0; i<width; i++) | |
1625 { | |
1626 int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4]; | |
1627 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5]; | |
1628 int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6]; | |
1629 | |
1630 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1631 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1632 } | |
1633 #endif | |
1634 } | |
1635 | |
1636 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width) | |
1637 { | |
4612 | 1638 #ifdef HAVE_MMX |
1639 asm volatile( | |
1640 "movl %2, %%eax \n\t" | |
1641 "movq bgr2YCoeff, %%mm6 \n\t" | |
1642 "movq w1111, %%mm5 \n\t" | |
1643 "pxor %%mm7, %%mm7 \n\t" | |
1644 "leal (%%eax, %%eax, 2), %%ebx \n\t" | |
1645 ".balign 16 \n\t" | |
1646 "1: \n\t" | |
1647 PREFETCH" 64(%0, %%ebx) \n\t" | |
1648 "movd (%0, %%ebx), %%mm0 \n\t" | |
1649 "movd 3(%0, %%ebx), %%mm1 \n\t" | |
1650 "punpcklbw %%mm7, %%mm0 \n\t" | |
1651 "punpcklbw %%mm7, %%mm1 \n\t" | |
1652 "movd 6(%0, %%ebx), %%mm2 \n\t" | |
1653 "movd 9(%0, %%ebx), %%mm3 \n\t" | |
1654 "punpcklbw %%mm7, %%mm2 \n\t" | |
1655 "punpcklbw %%mm7, %%mm3 \n\t" | |
1656 "pmaddwd %%mm6, %%mm0 \n\t" | |
1657 "pmaddwd %%mm6, %%mm1 \n\t" | |
1658 "pmaddwd %%mm6, %%mm2 \n\t" | |
1659 "pmaddwd %%mm6, %%mm3 \n\t" | |
1660 #ifndef FAST_BGR2YV12 | |
1661 "psrad $8, %%mm0 \n\t" | |
1662 "psrad $8, %%mm1 \n\t" | |
1663 "psrad $8, %%mm2 \n\t" | |
1664 "psrad $8, %%mm3 \n\t" | |
1665 #endif | |
1666 "packssdw %%mm1, %%mm0 \n\t" | |
1667 "packssdw %%mm3, %%mm2 \n\t" | |
1668 "pmaddwd %%mm5, %%mm0 \n\t" | |
1669 "pmaddwd %%mm5, %%mm2 \n\t" | |
1670 "packssdw %%mm2, %%mm0 \n\t" | |
1671 "psraw $7, %%mm0 \n\t" | |
1672 | |
1673 "movd 12(%0, %%ebx), %%mm4 \n\t" | |
1674 "movd 15(%0, %%ebx), %%mm1 \n\t" | |
1675 "punpcklbw %%mm7, %%mm4 \n\t" | |
1676 "punpcklbw %%mm7, %%mm1 \n\t" | |
1677 "movd 18(%0, %%ebx), %%mm2 \n\t" | |
1678 "movd 21(%0, %%ebx), %%mm3 \n\t" | |
1679 "punpcklbw %%mm7, %%mm2 \n\t" | |
1680 "punpcklbw %%mm7, %%mm3 \n\t" | |
1681 "pmaddwd %%mm6, %%mm4 \n\t" | |
1682 "pmaddwd %%mm6, %%mm1 \n\t" | |
1683 "pmaddwd %%mm6, %%mm2 \n\t" | |
1684 "pmaddwd %%mm6, %%mm3 \n\t" | |
1685 #ifndef FAST_BGR2YV12 | |
1686 "psrad $8, %%mm4 \n\t" | |
1687 "psrad $8, %%mm1 \n\t" | |
1688 "psrad $8, %%mm2 \n\t" | |
1689 "psrad $8, %%mm3 \n\t" | |
1690 #endif | |
1691 "packssdw %%mm1, %%mm4 \n\t" | |
1692 "packssdw %%mm3, %%mm2 \n\t" | |
1693 "pmaddwd %%mm5, %%mm4 \n\t" | |
1694 "pmaddwd %%mm5, %%mm2 \n\t" | |
1695 "addl $24, %%ebx \n\t" | |
1696 "packssdw %%mm2, %%mm4 \n\t" | |
1697 "psraw $7, %%mm4 \n\t" | |
1698 | |
1699 "packuswb %%mm4, %%mm0 \n\t" | |
1700 "paddusb bgr2YOffset, %%mm0 \n\t" | |
1701 | |
4619 | 1702 "movq %%mm0, (%1, %%eax) \n\t" |
4612 | 1703 "addl $8, %%eax \n\t" |
1704 " js 1b \n\t" | |
1705 : : "r" (src+width*3), "r" (dst+width), "g" (-width) | |
1706 : "%eax", "%ebx" | |
1707 ); | |
4467 | 1708 #else |
1709 int i; | |
1710 for(i=0; i<width; i++) | |
1711 { | |
1712 int b= src[i*3+0]; | |
1713 int g= src[i*3+1]; | |
1714 int r= src[i*3+2]; | |
1715 | |
1716 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; | |
1717 } | |
1718 #endif | |
1719 } | |
1720 | |
1721 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1722 { | |
4619 | 1723 #ifdef HAVE_MMX |
1724 asm volatile( | |
1725 "movl %4, %%eax \n\t" | |
1726 "movq w1111, %%mm5 \n\t" | |
1727 "movq bgr2UCoeff, %%mm6 \n\t" | |
1728 "pxor %%mm7, %%mm7 \n\t" | |
1729 "leal (%%eax, %%eax, 2), %%ebx \n\t" | |
1730 "addl %%ebx, %%ebx \n\t" | |
1731 ".balign 16 \n\t" | |
1732 "1: \n\t" | |
1733 PREFETCH" 64(%0, %%ebx) \n\t" | |
1734 PREFETCH" 64(%1, %%ebx) \n\t" | |
1735 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1736 "movq (%0, %%ebx), %%mm0 \n\t" | |
1737 "movq (%1, %%ebx), %%mm1 \n\t" | |
1738 "movq 6(%0, %%ebx), %%mm2 \n\t" | |
1739 "movq 6(%1, %%ebx), %%mm3 \n\t" | |
1740 PAVGB(%%mm1, %%mm0) | |
1741 PAVGB(%%mm3, %%mm2) | |
1742 "movq %%mm0, %%mm1 \n\t" | |
1743 "movq %%mm2, %%mm3 \n\t" | |
1744 "psrlq $24, %%mm0 \n\t" | |
1745 "psrlq $24, %%mm2 \n\t" | |
1746 PAVGB(%%mm1, %%mm0) | |
1747 PAVGB(%%mm3, %%mm2) | |
1748 "punpcklbw %%mm7, %%mm0 \n\t" | |
1749 "punpcklbw %%mm7, %%mm2 \n\t" | |
1750 #else | |
1751 "movd (%0, %%ebx), %%mm0 \n\t" | |
1752 "movd (%1, %%ebx), %%mm1 \n\t" | |
1753 "movd 3(%0, %%ebx), %%mm2 \n\t" | |
1754 "movd 3(%1, %%ebx), %%mm3 \n\t" | |
1755 "punpcklbw %%mm7, %%mm0 \n\t" | |
1756 "punpcklbw %%mm7, %%mm1 \n\t" | |
1757 "punpcklbw %%mm7, %%mm2 \n\t" | |
1758 "punpcklbw %%mm7, %%mm3 \n\t" | |
1759 "paddw %%mm1, %%mm0 \n\t" | |
1760 "paddw %%mm3, %%mm2 \n\t" | |
1761 "paddw %%mm2, %%mm0 \n\t" | |
1762 "movd 6(%0, %%ebx), %%mm4 \n\t" | |
1763 "movd 6(%1, %%ebx), %%mm1 \n\t" | |
1764 "movd 9(%0, %%ebx), %%mm2 \n\t" | |
1765 "movd 9(%1, %%ebx), %%mm3 \n\t" | |
1766 "punpcklbw %%mm7, %%mm4 \n\t" | |
1767 "punpcklbw %%mm7, %%mm1 \n\t" | |
1768 "punpcklbw %%mm7, %%mm2 \n\t" | |
1769 "punpcklbw %%mm7, %%mm3 \n\t" | |
1770 "paddw %%mm1, %%mm4 \n\t" | |
1771 "paddw %%mm3, %%mm2 \n\t" | |
1772 "paddw %%mm4, %%mm2 \n\t" | |
1773 "psrlw $2, %%mm0 \n\t" | |
1774 "psrlw $2, %%mm2 \n\t" | |
1775 #endif | |
1776 "movq bgr2VCoeff, %%mm1 \n\t" | |
1777 "movq bgr2VCoeff, %%mm3 \n\t" | |
1778 | |
1779 "pmaddwd %%mm0, %%mm1 \n\t" | |
1780 "pmaddwd %%mm2, %%mm3 \n\t" | |
1781 "pmaddwd %%mm6, %%mm0 \n\t" | |
1782 "pmaddwd %%mm6, %%mm2 \n\t" | |
1783 #ifndef FAST_BGR2YV12 | |
1784 "psrad $8, %%mm0 \n\t" | |
1785 "psrad $8, %%mm1 \n\t" | |
1786 "psrad $8, %%mm2 \n\t" | |
1787 "psrad $8, %%mm3 \n\t" | |
1788 #endif | |
1789 "packssdw %%mm2, %%mm0 \n\t" | |
1790 "packssdw %%mm3, %%mm1 \n\t" | |
1791 "pmaddwd %%mm5, %%mm0 \n\t" | |
1792 "pmaddwd %%mm5, %%mm1 \n\t" | |
1793 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
1794 "psraw $7, %%mm0 \n\t" | |
1795 | |
1796 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1797 "movq 12(%0, %%ebx), %%mm4 \n\t" | |
1798 "movq 12(%1, %%ebx), %%mm1 \n\t" | |
1799 "movq 18(%0, %%ebx), %%mm2 \n\t" | |
1800 "movq 18(%1, %%ebx), %%mm3 \n\t" | |
1801 PAVGB(%%mm1, %%mm4) | |
1802 PAVGB(%%mm3, %%mm2) | |
1803 "movq %%mm4, %%mm1 \n\t" | |
1804 "movq %%mm2, %%mm3 \n\t" | |
1805 "psrlq $24, %%mm4 \n\t" | |
1806 "psrlq $24, %%mm2 \n\t" | |
1807 PAVGB(%%mm1, %%mm4) | |
1808 PAVGB(%%mm3, %%mm2) | |
1809 "punpcklbw %%mm7, %%mm4 \n\t" | |
1810 "punpcklbw %%mm7, %%mm2 \n\t" | |
1811 #else | |
1812 "movd 12(%0, %%ebx), %%mm4 \n\t" | |
1813 "movd 12(%1, %%ebx), %%mm1 \n\t" | |
1814 "movd 15(%0, %%ebx), %%mm2 \n\t" | |
1815 "movd 15(%1, %%ebx), %%mm3 \n\t" | |
1816 "punpcklbw %%mm7, %%mm4 \n\t" | |
1817 "punpcklbw %%mm7, %%mm1 \n\t" | |
1818 "punpcklbw %%mm7, %%mm2 \n\t" | |
1819 "punpcklbw %%mm7, %%mm3 \n\t" | |
1820 "paddw %%mm1, %%mm4 \n\t" | |
1821 "paddw %%mm3, %%mm2 \n\t" | |
1822 "paddw %%mm2, %%mm4 \n\t" | |
1823 "movd 18(%0, %%ebx), %%mm5 \n\t" | |
1824 "movd 18(%1, %%ebx), %%mm1 \n\t" | |
1825 "movd 21(%0, %%ebx), %%mm2 \n\t" | |
1826 "movd 21(%1, %%ebx), %%mm3 \n\t" | |
1827 "punpcklbw %%mm7, %%mm5 \n\t" | |
1828 "punpcklbw %%mm7, %%mm1 \n\t" | |
1829 "punpcklbw %%mm7, %%mm2 \n\t" | |
1830 "punpcklbw %%mm7, %%mm3 \n\t" | |
1831 "paddw %%mm1, %%mm5 \n\t" | |
1832 "paddw %%mm3, %%mm2 \n\t" | |
1833 "paddw %%mm5, %%mm2 \n\t" | |
1834 "movq w1111, %%mm5 \n\t" | |
1835 "psrlw $2, %%mm4 \n\t" | |
1836 "psrlw $2, %%mm2 \n\t" | |
1837 #endif | |
1838 "movq bgr2VCoeff, %%mm1 \n\t" | |
1839 "movq bgr2VCoeff, %%mm3 \n\t" | |
1840 | |
1841 "pmaddwd %%mm4, %%mm1 \n\t" | |
1842 "pmaddwd %%mm2, %%mm3 \n\t" | |
1843 "pmaddwd %%mm6, %%mm4 \n\t" | |
1844 "pmaddwd %%mm6, %%mm2 \n\t" | |
1845 #ifndef FAST_BGR2YV12 | |
1846 "psrad $8, %%mm4 \n\t" | |
1847 "psrad $8, %%mm1 \n\t" | |
1848 "psrad $8, %%mm2 \n\t" | |
1849 "psrad $8, %%mm3 \n\t" | |
1850 #endif | |
1851 "packssdw %%mm2, %%mm4 \n\t" | |
1852 "packssdw %%mm3, %%mm1 \n\t" | |
1853 "pmaddwd %%mm5, %%mm4 \n\t" | |
1854 "pmaddwd %%mm5, %%mm1 \n\t" | |
1855 "addl $24, %%ebx \n\t" | |
1856 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | |
1857 "psraw $7, %%mm4 \n\t" | |
1858 | |
1859 "movq %%mm0, %%mm1 \n\t" | |
1860 "punpckldq %%mm4, %%mm0 \n\t" | |
1861 "punpckhdq %%mm4, %%mm1 \n\t" | |
1862 "packsswb %%mm1, %%mm0 \n\t" | |
1863 "paddb bgr2UVOffset, %%mm0 \n\t" | |
1864 | |
1865 "movd %%mm0, (%2, %%eax) \n\t" | |
1866 "punpckhdq %%mm0, %%mm0 \n\t" | |
1867 "movd %%mm0, (%3, %%eax) \n\t" | |
1868 "addl $4, %%eax \n\t" | |
1869 " js 1b \n\t" | |
1870 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width) | |
1871 : "%eax", "%ebx" | |
1872 ); | |
4467 | 1873 #else |
1874 int i; | |
1875 for(i=0; i<width; i++) | |
1876 { | |
1877 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3]; | |
1878 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4]; | |
1879 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5]; | |
1880 | |
1881 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1882 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1883 } | |
1884 #endif | |
1885 } | |
1886 | |
4578 | 1887 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width) |
1888 { | |
1889 int i; | |
1890 for(i=0; i<width; i++) | |
1891 { | |
1892 int d= src[i*2] + (src[i*2+1]<<8); | |
1893 int b= d&0x1F; | |
1894 int g= (d>>5)&0x3F; | |
1895 int r= (d>>11)&0x1F; | |
1896 | |
1897 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16; | |
1898 } | |
1899 } | |
1900 | |
1901 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1902 { | |
1903 int i; | |
1904 for(i=0; i<width; i++) | |
1905 { | |
4579 | 1906 #if 1 |
1907 int d0= le2me_32( ((uint32_t*)src1)[i] ); | |
1908 int d1= le2me_32( ((uint32_t*)src2)[i] ); | |
1909 | |
1910 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F); | |
1911 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F); | |
1912 | |
1913 int dh2= (dh>>11) + (dh<<21); | |
1914 int d= dh2 + dl; | |
1915 | |
1916 int b= d&0x7F; | |
1917 int r= (d>>11)&0x7F; | |
1918 int g= d>>21; | |
1919 #else | |
4578 | 1920 int d0= src1[i*4] + (src1[i*4+1]<<8); |
1921 int b0= d0&0x1F; | |
1922 int g0= (d0>>5)&0x3F; | |
1923 int r0= (d0>>11)&0x1F; | |
1924 | |
1925 int d1= src1[i*4+2] + (src1[i*4+3]<<8); | |
1926 int b1= d1&0x1F; | |
1927 int g1= (d1>>5)&0x3F; | |
1928 int r1= (d1>>11)&0x1F; | |
1929 | |
1930 int d2= src2[i*4] + (src2[i*4+1]<<8); | |
1931 int b2= d2&0x1F; | |
1932 int g2= (d2>>5)&0x3F; | |
1933 int r2= (d2>>11)&0x1F; | |
1934 | |
1935 int d3= src2[i*4+2] + (src2[i*4+3]<<8); | |
1936 int b3= d3&0x1F; | |
1937 int g3= (d3>>5)&0x3F; | |
1938 int r3= (d3>>11)&0x1F; | |
1939 | |
1940 int b= b0 + b1 + b2 + b3; | |
1941 int g= g0 + g1 + g2 + g3; | |
1942 int r= r0 + r1 + r2 + r3; | |
4579 | 1943 #endif |
4578 | 1944 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128; |
1945 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128; | |
1946 } | |
1947 } | |
1948 | |
4580 | 1949 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width) |
1950 { | |
1951 int i; | |
1952 for(i=0; i<width; i++) | |
1953 { | |
1954 int d= src[i*2] + (src[i*2+1]<<8); | |
1955 int b= d&0x1F; | |
1956 int g= (d>>5)&0x1F; | |
1957 int r= (d>>10)&0x1F; | |
1958 | |
1959 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16; | |
1960 } | |
1961 } | |
1962 | |
1963 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1964 { | |
1965 int i; | |
1966 for(i=0; i<width; i++) | |
1967 { | |
1968 #if 1 | |
1969 int d0= le2me_32( ((uint32_t*)src1)[i] ); | |
1970 int d1= le2me_32( ((uint32_t*)src2)[i] ); | |
1971 | |
1972 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F); | |
1973 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F); | |
1974 | |
1975 int dh2= (dh>>11) + (dh<<21); | |
1976 int d= dh2 + dl; | |
1977 | |
1978 int b= d&0x7F; | |
1979 int r= (d>>10)&0x7F; | |
1980 int g= d>>21; | |
1981 #else | |
1982 int d0= src1[i*4] + (src1[i*4+1]<<8); | |
1983 int b0= d0&0x1F; | |
1984 int g0= (d0>>5)&0x1F; | |
1985 int r0= (d0>>10)&0x1F; | |
1986 | |
1987 int d1= src1[i*4+2] + (src1[i*4+3]<<8); | |
1988 int b1= d1&0x1F; | |
1989 int g1= (d1>>5)&0x1F; | |
1990 int r1= (d1>>10)&0x1F; | |
1991 | |
1992 int d2= src2[i*4] + (src2[i*4+1]<<8); | |
1993 int b2= d2&0x1F; | |
1994 int g2= (d2>>5)&0x1F; | |
1995 int r2= (d2>>10)&0x1F; | |
1996 | |
1997 int d3= src2[i*4+2] + (src2[i*4+3]<<8); | |
1998 int b3= d3&0x1F; | |
1999 int g3= (d3>>5)&0x1F; | |
2000 int r3= (d3>>10)&0x1F; | |
2001 | |
2002 int b= b0 + b1 + b2 + b3; | |
2003 int g= g0 + g1 + g2 + g3; | |
2004 int r= r0 + r1 + r2 + r3; | |
2005 #endif | |
2006 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128; | |
2007 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128; | |
2008 } | |
2009 } | |
2010 | |
2011 | |
4558 | 2012 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width) |
2013 { | |
2014 int i; | |
2015 for(i=0; i<width; i++) | |
2016 { | |
2017 int r= src[i*4+0]; | |
2018 int g= src[i*4+1]; | |
2019 int b= src[i*4+2]; | |
2020 | |
2021 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; | |
2022 } | |
2023 } | |
2024 | |
2025 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2026 { | |
2027 int i; | |
2028 for(i=0; i<width; i++) | |
2029 { | |
2030 int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4]; | |
2031 int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5]; | |
2032 int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6]; | |
2033 | |
2034 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2035 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2036 } | |
2037 } | |
2038 | |
2039 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width) | |
2040 { | |
2041 int i; | |
2042 for(i=0; i<width; i++) | |
2043 { | |
2044 int r= src[i*3+0]; | |
2045 int g= src[i*3+1]; | |
2046 int b= src[i*3+2]; | |
2047 | |
2048 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; | |
2049 } | |
2050 } | |
2051 | |
2052 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2053 { | |
2054 int i; | |
2055 for(i=0; i<width; i++) | |
2056 { | |
2057 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3]; | |
2058 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4]; | |
2059 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5]; | |
2060 | |
2061 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2062 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2063 } | |
2064 } | |
2065 | |
4467 | 2066 |
3272 | 2067 // Bilinear / Bicubic scaling |
2068 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, | |
2069 int16_t *filter, int16_t *filterPos, int filterSize) | |
2070 { | |
2071 #ifdef HAVE_MMX | |
2072 if(filterSize==4) // allways true for upscaling, sometimes for down too | |
2073 { | |
2074 int counter= -2*dstW; | |
2075 filter-= counter*2; | |
2076 filterPos-= counter/2; | |
2077 dst-= counter/2; | |
2078 asm volatile( | |
2079 "pxor %%mm7, %%mm7 \n\t" | |
4248 | 2080 "movq "MANGLE(w02)", %%mm6 \n\t" |
3272 | 2081 "pushl %%ebp \n\t" // we use 7 regs here ... |
2082 "movl %%eax, %%ebp \n\t" | |
2083 ".balign 16 \n\t" | |
2084 "1: \n\t" | |
2085 "movzwl (%2, %%ebp), %%eax \n\t" | |
2086 "movzwl 2(%2, %%ebp), %%ebx \n\t" | |
2087 "movq (%1, %%ebp, 4), %%mm1 \n\t" | |
2088 "movq 8(%1, %%ebp, 4), %%mm3 \n\t" | |
2089 "movd (%3, %%eax), %%mm0 \n\t" | |
2090 "movd (%3, %%ebx), %%mm2 \n\t" | |
2091 "punpcklbw %%mm7, %%mm0 \n\t" | |
2092 "punpcklbw %%mm7, %%mm2 \n\t" | |
2093 "pmaddwd %%mm1, %%mm0 \n\t" | |
2094 "pmaddwd %%mm2, %%mm3 \n\t" | |
2095 "psrad $8, %%mm0 \n\t" | |
2096 "psrad $8, %%mm3 \n\t" | |
2097 "packssdw %%mm3, %%mm0 \n\t" | |
2098 "pmaddwd %%mm6, %%mm0 \n\t" | |
2099 "packssdw %%mm0, %%mm0 \n\t" | |
2100 "movd %%mm0, (%4, %%ebp) \n\t" | |
2101 "addl $4, %%ebp \n\t" | |
2102 " jnc 1b \n\t" | |
3352 | 2103 |
3272 | 2104 "popl %%ebp \n\t" |
2105 : "+a" (counter) | |
2106 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
2107 : "%ebx" | |
2108 ); | |
2109 } | |
2110 else if(filterSize==8) | |
2111 { | |
2112 int counter= -2*dstW; | |
2113 filter-= counter*4; | |
2114 filterPos-= counter/2; | |
2115 dst-= counter/2; | |
2116 asm volatile( | |
2117 "pxor %%mm7, %%mm7 \n\t" | |
4248 | 2118 "movq "MANGLE(w02)", %%mm6 \n\t" |
3272 | 2119 "pushl %%ebp \n\t" // we use 7 regs here ... |
2120 "movl %%eax, %%ebp \n\t" | |
2121 ".balign 16 \n\t" | |
2122 "1: \n\t" | |
2123 "movzwl (%2, %%ebp), %%eax \n\t" | |
2124 "movzwl 2(%2, %%ebp), %%ebx \n\t" | |
2125 "movq (%1, %%ebp, 8), %%mm1 \n\t" | |
2126 "movq 16(%1, %%ebp, 8), %%mm3 \n\t" | |
2127 "movd (%3, %%eax), %%mm0 \n\t" | |
2128 "movd (%3, %%ebx), %%mm2 \n\t" | |
2129 "punpcklbw %%mm7, %%mm0 \n\t" | |
2130 "punpcklbw %%mm7, %%mm2 \n\t" | |
2131 "pmaddwd %%mm1, %%mm0 \n\t" | |
2132 "pmaddwd %%mm2, %%mm3 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
2133 |
3272 | 2134 "movq 8(%1, %%ebp, 8), %%mm1 \n\t" |
2135 "movq 24(%1, %%ebp, 8), %%mm5 \n\t" | |
2136 "movd 4(%3, %%eax), %%mm4 \n\t" | |
2137 "movd 4(%3, %%ebx), %%mm2 \n\t" | |
2138 "punpcklbw %%mm7, %%mm4 \n\t" | |
2139 "punpcklbw %%mm7, %%mm2 \n\t" | |
2140 "pmaddwd %%mm1, %%mm4 \n\t" | |
2141 "pmaddwd %%mm2, %%mm5 \n\t" | |
2142 "paddd %%mm4, %%mm0 \n\t" | |
2143 "paddd %%mm5, %%mm3 \n\t" | |
2144 | |
2145 "psrad $8, %%mm0 \n\t" | |
2146 "psrad $8, %%mm3 \n\t" | |
2147 "packssdw %%mm3, %%mm0 \n\t" | |
2148 "pmaddwd %%mm6, %%mm0 \n\t" | |
2149 "packssdw %%mm0, %%mm0 \n\t" | |
2150 "movd %%mm0, (%4, %%ebp) \n\t" | |
2151 "addl $4, %%ebp \n\t" | |
2152 " jnc 1b \n\t" | |
3344 | 2153 |
3272 | 2154 "popl %%ebp \n\t" |
2155 : "+a" (counter) | |
2156 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
2157 : "%ebx" | |
2158 ); | |
2159 } | |
2160 else | |
2161 { | |
2162 int counter= -2*dstW; | |
2163 // filter-= counter*filterSize/2; | |
2164 filterPos-= counter/2; | |
2165 dst-= counter/2; | |
2166 asm volatile( | |
2167 "pxor %%mm7, %%mm7 \n\t" | |
4248 | 2168 "movq "MANGLE(w02)", %%mm6 \n\t" |
3272 | 2169 ".balign 16 \n\t" |
2170 "1: \n\t" | |
2171 "movl %2, %%ecx \n\t" | |
2172 "movzwl (%%ecx, %0), %%eax \n\t" | |
2173 "movzwl 2(%%ecx, %0), %%ebx \n\t" | |
2174 "movl %5, %%ecx \n\t" | |
2175 "pxor %%mm4, %%mm4 \n\t" | |
2176 "pxor %%mm5, %%mm5 \n\t" | |
2177 "2: \n\t" | |
2178 "movq (%1), %%mm1 \n\t" | |
2179 "movq (%1, %6), %%mm3 \n\t" | |
2180 "movd (%%ecx, %%eax), %%mm0 \n\t" | |
2181 "movd (%%ecx, %%ebx), %%mm2 \n\t" | |
2182 "punpcklbw %%mm7, %%mm0 \n\t" | |
2183 "punpcklbw %%mm7, %%mm2 \n\t" | |
2184 "pmaddwd %%mm1, %%mm0 \n\t" | |
2185 "pmaddwd %%mm2, %%mm3 \n\t" | |
2186 "paddd %%mm3, %%mm5 \n\t" | |
2187 "paddd %%mm0, %%mm4 \n\t" | |
2188 "addl $8, %1 \n\t" | |
2189 "addl $4, %%ecx \n\t" | |
2190 "cmpl %4, %%ecx \n\t" | |
2191 " jb 2b \n\t" | |
2192 "addl %6, %1 \n\t" | |
2193 "psrad $8, %%mm4 \n\t" | |
2194 "psrad $8, %%mm5 \n\t" | |
2195 "packssdw %%mm5, %%mm4 \n\t" | |
2196 "pmaddwd %%mm6, %%mm4 \n\t" | |
2197 "packssdw %%mm4, %%mm4 \n\t" | |
2198 "movl %3, %%eax \n\t" | |
2199 "movd %%mm4, (%%eax, %0) \n\t" | |
2200 "addl $4, %0 \n\t" | |
2201 " jnc 1b \n\t" | |
3344 | 2202 |
3641 | 2203 : "+r" (counter), "+r" (filter) |
2204 : "m" (filterPos), "m" (dst), "m"(src+filterSize), | |
3272 | 2205 "m" (src), "r" (filterSize*2) |
3299 | 2206 : "%ebx", "%eax", "%ecx" |
3272 | 2207 ); |
2208 } | |
2209 #else | |
2210 int i; | |
2211 for(i=0; i<dstW; i++) | |
2212 { | |
2213 int j; | |
2214 int srcPos= filterPos[i]; | |
2215 int val=0; | |
3344 | 2216 // printf("filterPos: %d\n", filterPos[i]); |
3272 | 2217 for(j=0; j<filterSize; j++) |
2218 { | |
2219 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); | |
2220 val += ((int)src[srcPos + j])*filter[filterSize*i + j]; | |
2221 } | |
2222 // filter += hFilterSize; | |
2223 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ... | |
2224 // dst[i] = val>>7; | |
2225 } | |
2226 #endif | |
2227 } | |
2228 // *** horizontal scale Y line to temp buffer | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2229 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc, |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2230 int flags, int canMMX2BeUsed, int16_t *hLumFilter, |
4467 | 2231 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, |
2232 int srcFormat, uint8_t *formatConvBuffer) | |
2469 | 2233 { |
4467 | 2234 if(srcFormat==IMGFMT_YUY2) |
2235 { | |
2236 RENAME(yuy2ToY)(formatConvBuffer, src, srcW); | |
2237 src= formatConvBuffer; | |
2238 } | |
2239 else if(srcFormat==IMGFMT_BGR32) | |
2240 { | |
2241 RENAME(bgr32ToY)(formatConvBuffer, src, srcW); | |
2242 src= formatConvBuffer; | |
2243 } | |
2244 else if(srcFormat==IMGFMT_BGR24) | |
2245 { | |
2246 RENAME(bgr24ToY)(formatConvBuffer, src, srcW); | |
2247 src= formatConvBuffer; | |
2248 } | |
4578 | 2249 else if(srcFormat==IMGFMT_BGR16) |
2250 { | |
2251 RENAME(bgr16ToY)(formatConvBuffer, src, srcW); | |
2252 src= formatConvBuffer; | |
2253 } | |
4580 | 2254 else if(srcFormat==IMGFMT_BGR15) |
2255 { | |
2256 RENAME(bgr15ToY)(formatConvBuffer, src, srcW); | |
2257 src= formatConvBuffer; | |
2258 } | |
4558 | 2259 else if(srcFormat==IMGFMT_RGB32) |
2260 { | |
2261 RENAME(rgb32ToY)(formatConvBuffer, src, srcW); | |
2262 src= formatConvBuffer; | |
2263 } | |
2264 else if(srcFormat==IMGFMT_RGB24) | |
2265 { | |
2266 RENAME(rgb24ToY)(formatConvBuffer, src, srcW); | |
2267 src= formatConvBuffer; | |
2268 } | |
4467 | 2269 |
3352 | 2270 #ifdef HAVE_MMX |
2271 // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one) | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2272 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) |
3352 | 2273 #else |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2274 if(!(flags&SWS_FAST_BILINEAR)) |
3352 | 2275 #endif |
3272 | 2276 { |
2277 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); | |
2278 } | |
2279 else // Fast Bilinear upscale / crap downscale | |
2280 { | |
2469 | 2281 #ifdef ARCH_X86 |
2282 #ifdef HAVE_MMX2 | |
2671 | 2283 int i; |
2469 | 2284 if(canMMX2BeUsed) |
2285 { | |
2286 asm volatile( | |
2287 "pxor %%mm7, %%mm7 \n\t" | |
2288 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | |
2289 "movd %5, %%mm6 \n\t" // xInc&0xFFFF | |
2290 "punpcklwd %%mm6, %%mm6 \n\t" | |
2291 "punpcklwd %%mm6, %%mm6 \n\t" | |
2292 "movq %%mm6, %%mm2 \n\t" | |
2293 "psllq $16, %%mm2 \n\t" | |
2294 "paddw %%mm6, %%mm2 \n\t" | |
2295 "psllq $16, %%mm2 \n\t" | |
2296 "paddw %%mm6, %%mm2 \n\t" | |
2297 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2298 "movq %%mm2, %%mm4 \n\t" |
2469 | 2299 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF |
2300 "punpcklwd %%mm6, %%mm6 \n\t" | |
2301 "punpcklwd %%mm6, %%mm6 \n\t" | |
2302 "xorl %%eax, %%eax \n\t" // i | |
2303 "movl %0, %%esi \n\t" // src | |
2304 "movl %1, %%edi \n\t" // buf1 | |
2305 "movl %3, %%edx \n\t" // (xInc*4)>>16 | |
2306 "xorl %%ecx, %%ecx \n\t" | |
2307 "xorl %%ebx, %%ebx \n\t" | |
2308 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF | |
2520 | 2309 |
2469 | 2310 #define FUNNY_Y_CODE \ |
2520 | 2311 PREFETCH" 1024(%%esi) \n\t"\ |
2312 PREFETCH" 1056(%%esi) \n\t"\ | |
2313 PREFETCH" 1088(%%esi) \n\t"\ | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2314 "call *%6 \n\t"\ |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2315 "movq %%mm4, %%mm2 \n\t"\ |
2469 | 2316 "xorl %%ecx, %%ecx \n\t" |
2520 | 2317 |
2469 | 2318 FUNNY_Y_CODE |
2319 FUNNY_Y_CODE | |
2320 FUNNY_Y_CODE | |
2321 FUNNY_Y_CODE | |
2322 FUNNY_Y_CODE | |
2323 FUNNY_Y_CODE | |
2324 FUNNY_Y_CODE | |
2325 FUNNY_Y_CODE | |
2326 | |
2327 :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16), | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2328 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode) |
2469 | 2329 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" |
2330 ); | |
3215 | 2331 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; |
2469 | 2332 } |
2333 else | |
2334 { | |
2335 #endif | |
2336 //NO MMX just normal asm ... | |
2337 asm volatile( | |
2338 "xorl %%eax, %%eax \n\t" // i | |
2339 "xorl %%ebx, %%ebx \n\t" // xx | |
2340 "xorl %%ecx, %%ecx \n\t" // 2*xalpha | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
2341 ".balign 16 \n\t" |
2469 | 2342 "1: \n\t" |
2343 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | |
2344 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | |
2345 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2346 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2347 "shll $16, %%edi \n\t" | |
2348 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2349 "movl %1, %%edi \n\t" | |
2350 "shrl $9, %%esi \n\t" | |
2351 "movw %%si, (%%edi, %%eax, 2) \n\t" | |
2352 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
2353 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry | |
2354 | |
2355 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | |
2356 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | |
2357 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2358 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2359 "shll $16, %%edi \n\t" | |
2360 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2361 "movl %1, %%edi \n\t" | |
2362 "shrl $9, %%esi \n\t" | |
2363 "movw %%si, 2(%%edi, %%eax, 2) \n\t" | |
2364 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
2365 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry | |
2366 | |
2367 | |
2368 "addl $2, %%eax \n\t" | |
2369 "cmpl %2, %%eax \n\t" | |
2370 " jb 1b \n\t" | |
2371 | |
2372 | |
2373 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF) | |
2374 : "%eax", "%ebx", "%ecx", "%edi", "%esi" | |
2375 ); | |
2376 #ifdef HAVE_MMX2 | |
2377 } //if MMX2 cant be used | |
2378 #endif | |
2379 #else | |
2671 | 2380 int i; |
2381 unsigned int xpos=0; | |
2382 for(i=0;i<dstWidth;i++) | |
2383 { | |
2384 register unsigned int xx=xpos>>16; | |
2385 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2386 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; | |
2387 xpos+=xInc; | |
2388 } | |
2469 | 2389 #endif |
3272 | 2390 } |
2469 | 2391 } |
2392 | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2393 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2, |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2394 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, |
4467 | 2395 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, |
2396 int srcFormat, uint8_t *formatConvBuffer) | |
2469 | 2397 { |
4467 | 2398 if(srcFormat==IMGFMT_YUY2) |
2399 { | |
2400 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2401 src1= formatConvBuffer; | |
2402 src2= formatConvBuffer+2048; | |
2403 } | |
2404 else if(srcFormat==IMGFMT_BGR32) | |
2405 { | |
2406 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2407 src1= formatConvBuffer; | |
2408 src2= formatConvBuffer+2048; | |
2409 } | |
2410 else if(srcFormat==IMGFMT_BGR24) | |
2411 { | |
2412 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2413 src1= formatConvBuffer; | |
2414 src2= formatConvBuffer+2048; | |
2415 } | |
4578 | 2416 else if(srcFormat==IMGFMT_BGR16) |
2417 { | |
2418 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2419 src1= formatConvBuffer; | |
2420 src2= formatConvBuffer+2048; | |
2421 } | |
4580 | 2422 else if(srcFormat==IMGFMT_BGR15) |
2423 { | |
2424 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2425 src1= formatConvBuffer; | |
2426 src2= formatConvBuffer+2048; | |
2427 } | |
4558 | 2428 else if(srcFormat==IMGFMT_RGB32) |
2429 { | |
2430 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2431 src1= formatConvBuffer; | |
2432 src2= formatConvBuffer+2048; | |
2433 } | |
2434 else if(srcFormat==IMGFMT_RGB24) | |
2435 { | |
2436 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2437 src1= formatConvBuffer; | |
2438 src2= formatConvBuffer+2048; | |
2439 } | |
4481 | 2440 else if(isGray(srcFormat)) |
2441 { | |
2442 return; | |
2443 } | |
4467 | 2444 |
3352 | 2445 #ifdef HAVE_MMX |
2446 // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one) | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2447 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) |
3352 | 2448 #else |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2449 if(!(flags&SWS_FAST_BILINEAR)) |
3352 | 2450 #endif |
3272 | 2451 { |
2452 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2453 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2454 } | |
2455 else // Fast Bilinear upscale / crap downscale | |
2456 { | |
2469 | 2457 #ifdef ARCH_X86 |
2458 #ifdef HAVE_MMX2 | |
2671 | 2459 int i; |
2469 | 2460 if(canMMX2BeUsed) |
2461 { | |
2462 asm volatile( | |
2463 "pxor %%mm7, %%mm7 \n\t" | |
2464 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | |
2465 "movd %5, %%mm6 \n\t" // xInc&0xFFFF | |
2466 "punpcklwd %%mm6, %%mm6 \n\t" | |
2467 "punpcklwd %%mm6, %%mm6 \n\t" | |
2468 "movq %%mm6, %%mm2 \n\t" | |
2469 "psllq $16, %%mm2 \n\t" | |
2470 "paddw %%mm6, %%mm2 \n\t" | |
2471 "psllq $16, %%mm2 \n\t" | |
2472 "paddw %%mm6, %%mm2 \n\t" | |
2473 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2474 "movq %%mm2, %%mm4 \n\t" |
2469 | 2475 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF |
2476 "punpcklwd %%mm6, %%mm6 \n\t" | |
2477 "punpcklwd %%mm6, %%mm6 \n\t" | |
2478 "xorl %%eax, %%eax \n\t" // i | |
2479 "movl %0, %%esi \n\t" // src | |
2480 "movl %1, %%edi \n\t" // buf1 | |
2481 "movl %3, %%edx \n\t" // (xInc*4)>>16 | |
2482 "xorl %%ecx, %%ecx \n\t" | |
2483 "xorl %%ebx, %%ebx \n\t" | |
2484 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF | |
2485 | |
2486 #define FUNNYUVCODE \ | |
2520 | 2487 PREFETCH" 1024(%%esi) \n\t"\ |
2488 PREFETCH" 1056(%%esi) \n\t"\ | |
2489 PREFETCH" 1088(%%esi) \n\t"\ | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2490 "call *%7 \n\t"\ |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2491 "movq %%mm4, %%mm2 \n\t"\ |
2469 | 2492 "xorl %%ecx, %%ecx \n\t" |
2493 | |
2494 FUNNYUVCODE | |
2495 FUNNYUVCODE | |
2496 FUNNYUVCODE | |
2497 FUNNYUVCODE | |
2498 | |
2499 FUNNYUVCODE | |
2500 FUNNYUVCODE | |
2501 FUNNYUVCODE | |
2502 FUNNYUVCODE | |
2503 "xorl %%eax, %%eax \n\t" // i | |
2504 "movl %6, %%esi \n\t" // src | |
2505 "movl %1, %%edi \n\t" // buf1 | |
2506 "addl $4096, %%edi \n\t" | |
2507 | |
2508 FUNNYUVCODE | |
2509 FUNNYUVCODE | |
2510 FUNNYUVCODE | |
2511 FUNNYUVCODE | |
2512 | |
2513 FUNNYUVCODE | |
2514 FUNNYUVCODE | |
2515 FUNNYUVCODE | |
2516 FUNNYUVCODE | |
2517 | |
2518 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16), | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2519 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode) |
2469 | 2520 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" |
2521 ); | |
3344 | 2522 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) |
2469 | 2523 { |
3344 | 2524 // printf("%d %d %d\n", dstWidth, i, srcW); |
2525 dst[i] = src1[srcW-1]*128; | |
2526 dst[i+2048] = src2[srcW-1]*128; | |
2469 | 2527 } |
2528 } | |
2529 else | |
2530 { | |
2531 #endif | |
2532 asm volatile( | |
2533 "xorl %%eax, %%eax \n\t" // i | |
2534 "xorl %%ebx, %%ebx \n\t" // xx | |
2535 "xorl %%ecx, %%ecx \n\t" // 2*xalpha | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
2536 ".balign 16 \n\t" |
2469 | 2537 "1: \n\t" |
2538 "movl %0, %%esi \n\t" | |
2539 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx] | |
2540 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1] | |
2541 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2542 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2543 "shll $16, %%edi \n\t" | |
2544 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2545 "movl %1, %%edi \n\t" | |
2546 "shrl $9, %%esi \n\t" | |
2547 "movw %%si, (%%edi, %%eax, 2) \n\t" | |
2548 | |
2549 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx] | |
2550 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1] | |
2551 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2552 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2553 "shll $16, %%edi \n\t" | |
2554 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2555 "movl %1, %%edi \n\t" | |
2556 "shrl $9, %%esi \n\t" | |
2557 "movw %%si, 4096(%%edi, %%eax, 2)\n\t" | |
2558 | |
2559 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
2560 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry | |
2561 "addl $1, %%eax \n\t" | |
2562 "cmpl %2, %%eax \n\t" | |
2563 " jb 1b \n\t" | |
2564 | |
2565 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF), | |
2566 "r" (src2) | |
2567 : "%eax", "%ebx", "%ecx", "%edi", "%esi" | |
2568 ); | |
2569 #ifdef HAVE_MMX2 | |
2570 } //if MMX2 cant be used | |
2571 #endif | |
2572 #else | |
2671 | 2573 int i; |
2574 unsigned int xpos=0; | |
2575 for(i=0;i<dstWidth;i++) | |
2576 { | |
2577 register unsigned int xx=xpos>>16; | |
2578 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2579 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | |
2580 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | |
2566 | 2581 /* slower |
2582 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; | |
2583 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; | |
2584 */ | |
2671 | 2585 xpos+=xInc; |
2586 } | |
2469 | 2587 #endif |
3272 | 2588 } |
2589 } | |
2590 | |
4467 | 2591 static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY, |
4698 | 2592 int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){ |
3344 | 2593 |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2594 /* load a few things into local vars to make the code more readable? and faster */ |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2595 const int srcW= c->srcW; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2596 const int dstW= c->dstW; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2597 const int dstH= c->dstH; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2598 const int chrDstW= c->chrDstW; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2599 const int lumXInc= c->lumXInc; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2600 const int chrXInc= c->chrXInc; |
4295 | 2601 const int dstFormat= c->dstFormat; |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2602 const int flags= c->flags; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2603 const int canMMX2BeUsed= c->canMMX2BeUsed; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2604 int16_t *vLumFilterPos= c->vLumFilterPos; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2605 int16_t *vChrFilterPos= c->vChrFilterPos; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2606 int16_t *hLumFilterPos= c->hLumFilterPos; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2607 int16_t *hChrFilterPos= c->hChrFilterPos; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2608 int16_t *vLumFilter= c->vLumFilter; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2609 int16_t *vChrFilter= c->vChrFilter; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2610 int16_t *hLumFilter= c->hLumFilter; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2611 int16_t *hChrFilter= c->hChrFilter; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2612 int16_t *lumMmxFilter= c->lumMmxFilter; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2613 int16_t *chrMmxFilter= c->chrMmxFilter; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2614 const int vLumFilterSize= c->vLumFilterSize; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2615 const int vChrFilterSize= c->vChrFilterSize; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2616 const int hLumFilterSize= c->hLumFilterSize; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2617 const int hChrFilterSize= c->hChrFilterSize; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2618 int16_t **lumPixBuf= c->lumPixBuf; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2619 int16_t **chrPixBuf= c->chrPixBuf; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2620 const int vLumBufSize= c->vLumBufSize; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2621 const int vChrBufSize= c->vChrBufSize; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2622 uint8_t *funnyYCode= c->funnyYCode; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2623 uint8_t *funnyUVCode= c->funnyUVCode; |
4467 | 2624 uint8_t *formatConvBuffer= c->formatConvBuffer; |
3344 | 2625 |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2626 /* vars whch will change and which we need to storw back in the context */ |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2627 int dstY= c->dstY; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2628 int lumBufIndex= c->lumBufIndex; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2629 int chrBufIndex= c->chrBufIndex; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2630 int lastInLumBuf= c->lastInLumBuf; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2631 int lastInChrBuf= c->lastInChrBuf; |
4467 | 2632 int srcStride[3]; |
4698 | 2633 int dstStride[3]; |
4419 | 2634 uint8_t *src[3]; |
2635 uint8_t *dst[3]; | |
2636 | |
4554 | 2637 if(c->srcFormat == IMGFMT_I420){ |
4419 | 2638 src[0]= srcParam[0]; |
2639 src[1]= srcParam[2]; | |
2640 src[2]= srcParam[1]; | |
4467 | 2641 srcStride[0]= srcStrideParam[0]; |
2642 srcStride[1]= srcStrideParam[2]; | |
2643 srcStride[2]= srcStrideParam[1]; | |
2644 } | |
2645 else if(c->srcFormat==IMGFMT_YV12){ | |
4419 | 2646 src[0]= srcParam[0]; |
2647 src[1]= srcParam[1]; | |
2648 src[2]= srcParam[2]; | |
4467 | 2649 srcStride[0]= srcStrideParam[0]; |
2650 srcStride[1]= srcStrideParam[1]; | |
2651 srcStride[2]= srcStrideParam[2]; | |
2652 } | |
2653 else if(isPacked(c->srcFormat)){ | |
2654 src[0]= | |
2655 src[1]= | |
2656 src[2]= srcParam[0]; | |
2657 srcStride[0]= srcStrideParam[0]; | |
2658 srcStride[1]= | |
2659 srcStride[2]= srcStrideParam[0]<<1; | |
2660 } | |
4481 | 2661 else if(isGray(c->srcFormat)){ |
4467 | 2662 src[0]= srcParam[0]; |
2663 src[1]= | |
2664 src[2]= NULL; | |
2665 srcStride[0]= srcStrideParam[0]; | |
2666 srcStride[1]= | |
2667 srcStride[2]= 0; | |
4419 | 2668 } |
2669 | |
4698 | 2670 if(dstFormat == IMGFMT_I420){ |
4419 | 2671 dst[0]= dstParam[0]; |
2672 dst[1]= dstParam[2]; | |
2673 dst[2]= dstParam[1]; | |
4698 | 2674 dstStride[0]= dstStrideParam[0]; |
2675 dstStride[1]= dstStrideParam[2]; | |
2676 dstStride[2]= dstStrideParam[1]; | |
4419 | 2677 }else{ |
2678 dst[0]= dstParam[0]; | |
2679 dst[1]= dstParam[1]; | |
2680 dst[2]= dstParam[2]; | |
4698 | 2681 dstStride[0]= dstStrideParam[0]; |
2682 dstStride[1]= dstStrideParam[1]; | |
2683 dstStride[2]= dstStrideParam[2]; | |
4419 | 2684 } |
4554 | 2685 |
2686 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2], | |
2687 //dstStride[0],dstStride[1],dstStride[2]); | |
4419 | 2688 |
2689 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0) | |
2690 { | |
2691 static int firstTime=1; //FIXME move this into the context perhaps | |
2692 if(flags & SWS_PRINT_INFO && firstTime) | |
2693 { | |
2694 fprintf(stderr, "SwScaler: Warning: dstStride is not aligned!\n" | |
2695 "SwScaler: ->cannot do aligned memory acesses anymore\n"); | |
2696 firstTime=0; | |
2697 } | |
2698 } | |
3344 | 2699 |
4467 | 2700 /* Note the user might start scaling the picture in the middle so this will not get executed |
2701 this is not really intended but works currently, so ppl might do it */ | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2702 if(srcSliceY ==0){ |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2703 lumBufIndex=0; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2704 chrBufIndex=0; |
4467 | 2705 dstY=0; |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2706 lastInLumBuf= -1; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2707 lastInChrBuf= -1; |
3272 | 2708 } |
3344 | 2709 |
2710 for(;dstY < dstH; dstY++){ | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2711 unsigned char *dest =dst[0]+dstStride[0]*dstY; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2712 unsigned char *uDest=dst[1]+dstStride[1]*(dstY>>1); |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2713 unsigned char *vDest=dst[2]+dstStride[2]*(dstY>>1); |
4419 | 2714 const int chrDstY= isHalfChrV(dstFormat) ? (dstY>>1) : dstY; |
3344 | 2715 |
2716 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input | |
2717 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input | |
2718 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input | |
2719 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input | |
2720 | |
4290
1f8ceb12284d
general convolution filtering of the source picture
michael
parents:
4276
diff
changeset
|
2721 //handle holes (FAST_BILINEAR & weird filters) |
1f8ceb12284d
general convolution filtering of the source picture
michael
parents:
4276
diff
changeset
|
2722 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; |
1f8ceb12284d
general convolution filtering of the source picture
michael
parents:
4276
diff
changeset
|
2723 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; |
1f8ceb12284d
general convolution filtering of the source picture
michael
parents:
4276
diff
changeset
|
2724 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize); |
3344 | 2725 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1) |
2726 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1) | |
2216 | 2727 |
3344 | 2728 // Do we have enough lines in this slice to output the dstY line |
4698 | 2729 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH + 1)>>1)) |
2469 | 2730 { |
3344 | 2731 //Do horizontal scaling |
2732 while(lastInLumBuf < lastLumSrcY) | |
2733 { | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2734 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; |
3344 | 2735 lumBufIndex++; |
4290
1f8ceb12284d
general convolution filtering of the source picture
michael
parents:
4276
diff
changeset
|
2736 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY); |
3344 | 2737 ASSERT(lumBufIndex < 2*vLumBufSize) |
2738 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
2739 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
2740 // printf("%d %d\n", lumBufIndex, vLumBufSize); | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2741 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2742 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, |
4467 | 2743 funnyYCode, c->srcFormat, formatConvBuffer); |
3344 | 2744 lastInLumBuf++; |
2745 } | |
2746 while(lastInChrBuf < lastChrSrcY) | |
2747 { | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2748 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1]; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2749 uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2]; |
3344 | 2750 chrBufIndex++; |
2751 ASSERT(chrBufIndex < 2*vChrBufSize) | |
4698 | 2752 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1)) |
3344 | 2753 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2754 //FIXME replace parameters through context struct (some at least) |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2755 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc, |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2756 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, |
4467 | 2757 funnyUVCode, c->srcFormat, formatConvBuffer); |
3344 | 2758 lastInChrBuf++; |
2759 } | |
2760 //wrap buf index around to stay inside the ring buffer | |
2761 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
2762 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
2469 | 2763 } |
3344 | 2764 else // not enough lines left in this slice -> load the rest in the buffer |
2469 | 2765 { |
3344 | 2766 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n", |
2767 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY, | |
2768 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize, | |
2769 vChrBufSize, vLumBufSize); | |
2770 */ | |
2771 //Do horizontal scaling | |
2772 while(lastInLumBuf+1 < srcSliceY + srcSliceH) | |
2469 | 2773 { |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2774 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; |
3344 | 2775 lumBufIndex++; |
2776 ASSERT(lumBufIndex < 2*vLumBufSize) | |
2777 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
2778 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2779 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2780 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, |
4467 | 2781 funnyYCode, c->srcFormat, formatConvBuffer); |
3344 | 2782 lastInLumBuf++; |
2469 | 2783 } |
3344 | 2784 while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1)) |
2785 { | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2786 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1]; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2787 uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2]; |
3344 | 2788 chrBufIndex++; |
2789 ASSERT(chrBufIndex < 2*vChrBufSize) | |
4698 | 2790 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1)) |
3344 | 2791 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2792 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc, |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2793 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, |
4467 | 2794 funnyUVCode, c->srcFormat, formatConvBuffer); |
3344 | 2795 lastInChrBuf++; |
2796 } | |
2797 //wrap buf index around to stay inside the ring buffer | |
2798 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
2799 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
2800 break; //we cant output a dstY line so lets try with the next slice | |
2469 | 2801 } |
2264
7851375ea156
increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents:
2237
diff
changeset
|
2802 |
2748 | 2803 #ifdef HAVE_MMX |
3344 | 2804 b5Dither= dither8[dstY&1]; |
2805 g6Dither= dither4[dstY&1]; | |
2806 g5Dither= dither8[dstY&1]; | |
2807 r5Dither= dither8[(dstY+1)&1]; | |
2748 | 2808 #endif |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2809 if(dstY < dstH-2) |
3352 | 2810 { |
4419 | 2811 if(isPlanarYUV(dstFormat)) //YV12 like |
3344 | 2812 { |
2813 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
2814 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12 | |
2815 { | |
2816 int16_t *lumBuf = lumPixBuf[0]; | |
2817 int16_t *chrBuf= chrPixBuf[0]; | |
2818 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW); | |
2819 } | |
2820 else //General YV12 | |
2821 { | |
2822 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
2823 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
2824 RENAME(yuv2yuvX)( | |
2825 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
2826 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2827 dest, uDest, vDest, dstW, | |
2828 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4); | |
2829 } | |
2830 } | |
2831 else | |
2832 { | |
2833 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
2834 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
2835 | |
2836 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
2837 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
2838 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB | |
2839 { | |
2840 int chrAlpha= vChrFilter[2*dstY+1]; | |
2841 | |
2842 RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2843 dest, dstW, chrAlpha, dstFormat, flags); |
3344 | 2844 } |
2845 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB | |
2846 { | |
2847 int lumAlpha= vLumFilter[2*dstY+1]; | |
2848 int chrAlpha= vChrFilter[2*dstY+1]; | |
2849 | |
2850 RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2851 dest, dstW, lumAlpha, chrAlpha, dstFormat, flags); |
3344 | 2852 } |
2853 else //General RGB | |
2854 { | |
2855 RENAME(yuv2rgbX)( | |
2856 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
2857 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2858 dest, dstW, dstFormat, |
3344 | 2859 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4); |
2860 } | |
2861 } | |
3352 | 2862 } |
2863 else // hmm looks like we cant use MMX here without overwriting this arrays tail | |
2864 { | |
2865 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
2866 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
4419 | 2867 if(isPlanarYUV(dstFormat)) //YV12 |
3352 | 2868 { |
2869 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
2870 yuv2yuvXinC( | |
2871 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
2872 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2873 dest, uDest, vDest, dstW); | |
2874 } | |
2875 else | |
2876 { | |
2877 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
2878 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
2879 yuv2rgbXinC( | |
2880 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
2881 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2882 dest, dstW, dstFormat); |
3352 | 2883 } |
2884 } | |
3344 | 2885 } |
2534
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2886 |
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2887 #ifdef HAVE_MMX |
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2888 __asm __volatile(SFENCE:::"memory"); |
2566 | 2889 __asm __volatile(EMMS:::"memory"); |
2534
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2890 #endif |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2891 /* store changed local vars back in the context */ |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2892 c->dstY= dstY; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2893 c->lumBufIndex= lumBufIndex; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2894 c->chrBufIndex= chrBufIndex; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2895 c->lastInLumBuf= lastInLumBuf; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2896 c->lastInChrBuf= lastInChrBuf; |
3641 | 2897 } |