Mercurial > mplayer.hg
annotate postproc/swscale_template.c @ 13387:d1abaa60967a
Fixes indentation around my contributions, and adds lavc's "turbo" mode,
inspired by XviD "turbo" mode.
author | gpoirier |
---|---|
date | Sun, 19 Sep 2004 10:17:07 +0000 |
parents | d2aef091743c |
children | 821f464b4d90 |
rev | line source |
---|---|
4295 | 1 /* |
9476
eff727517e6b
yuv2rgb brightness/contrast/saturation/different colorspaces support finished
michael
parents:
9434
diff
changeset
|
2 Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> |
2216 | 3 |
4295 | 4 This program is free software; you can redistribute it and/or modify |
5 it under the terms of the GNU General Public License as published by | |
6 the Free Software Foundation; either version 2 of the License, or | |
7 (at your option) any later version. | |
2216 | 8 |
4295 | 9 This program is distributed in the hope that it will be useful, |
10 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 GNU General Public License for more details. | |
13 | |
14 You should have received a copy of the GNU General Public License | |
15 along with this program; if not, write to the Free Software | |
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
17 */ | |
2264
7851375ea156
increased precision of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents:
2237
diff
changeset
|
18 |
/*
 * Instruction-selection macros for the inline-asm strings below.
 *
 * Any previous definitions of MOVNTQ, PAVGB, PREFETCH, PREFETCHW, EMMS and
 * SFENCE are dropped first (presumably because this template is included
 * more than once with different HAVE_* settings -- TODO confirm against the
 * including file).  EMMS, PREFETCH, PREFETCHW and SFENCE are then redefined
 * here as instruction-mnemonic strings chosen by HAVE_3DNOW / HAVE_MMX2.
 *
 * NOTE(review): the fallback strings are "/nop", not "nop".  This presumably
 * relies on the assembler treating a leading '/' as the start of a comment so
 * that nothing is emitted -- confirm before "fixing" it.
 */
2540 | 19 #undef MOVNTQ |
2680 | 20 #undef PAVGB |
3136 | 21 #undef PREFETCH |
22 #undef PREFETCHW | |
23 #undef EMMS | |
24 #undef SFENCE | |
25 | |
26 #ifdef HAVE_3DNOW | |
27 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */ | |
28 #define EMMS "femms" | |
29 #else | |
30 #define EMMS "emms" | |
31 #endif | |
32 | |
33 #ifdef HAVE_3DNOW | |
34 #define PREFETCH "prefetch" | |
35 #define PREFETCHW "prefetchw" | |
36 #elif defined ( HAVE_MMX2 ) | |
37 #define PREFETCH "prefetchnta" | |
38 #define PREFETCHW "prefetcht0" | |
39 #else | |
40 #define PREFETCH "/nop" | |
41 #define PREFETCHW "/nop" | |
42 #endif | |
43 | |
44 #ifdef HAVE_MMX2 | |
45 #define SFENCE "sfence" | |
46 #else | |
47 #define SFENCE "/nop" | |
48 #endif | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
49 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
/*
 * PAVGB(a,b): expands to one asm line, "pavgb a, b" when HAVE_MMX2 is
 * defined, otherwise "pavgusb a, b" when HAVE_3DNOW is defined.  Both
 * arguments are stringified as written.  If neither feature macro is
 * defined, PAVGB is left undefined by this block.
 */
50 #ifdef HAVE_MMX2 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
51 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
52 #elif defined (HAVE_3DNOW) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
53 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
54 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
55 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
/*
 * MOVNTQ(a,b): expands to one asm line that stores a to b.  With HAVE_MMX2
 * it uses the "movntq" mnemonic; otherwise it falls back to a plain "movq".
 * Both arguments are stringified as written.
 */
56 #ifdef HAVE_MMX2 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
57 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
58 #else |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
59 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
60 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
61 |
12017
21e5cb258a95
AltiVec support in postproc/ + altivec optimizations for yuv2yuvX patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
11122
diff
changeset
|
/* Textually include the AltiVec template only when HAVE_ALTIVEC is defined. */
62 #ifdef HAVE_ALTIVEC |
21e5cb258a95
AltiVec support in postproc/ + altivec optimizations for yuv2yuvX patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
11122
diff
changeset
|
63 #include "swscale_altivec_template.c" |
21e5cb258a95
AltiVec support in postproc/ + altivec optimizations for yuv2yuvX patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
11122
diff
changeset
|
64 #endif |
21e5cb258a95
AltiVec support in postproc/ + altivec optimizations for yuv2yuvX patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
11122
diff
changeset
|
65 |
/*
 * YSCALEYUV2YV12X(x, offset): asm fragment that filters several source lines
 * into one line of 8-bit output, 8 output bytes per outer iteration.
 *
 * Macro arguments:
 *   x      - byte displacement added to every source-line pointer
 *            (stringified into the addressing mode).
 *   offset - string literal; displacement from %0 of the filter list.
 * Asm operands used:
 *   %0 - base pointer; VROUNDER_OFFSET and `offset` are displacements from it.
 *   %1 - destination base, indexed by %eax.
 *   %2 - limit; the outer loop runs while %eax is below it (unsigned, "jb").
 *
 * The filter list is walked in 16-byte steps: the dword at +0 is a source
 * pointer and the quadword at +8 holds the coefficients.  A zero source
 * pointer ends the list.  Products are taken with pmulhw and summed into
 * %mm3/%mm4, which start from the value at VROUNDER_OFFSET(%0); the sums are
 * shifted right by 3 and packed to bytes with unsigned saturation.
 *
 * Both the inner "jnz 1b" and the outer "jb 1b" target the single label 1.
 * That works because the accumulators and list pointer are re-initialised
 * after the "cmpl" using only mov/lea instructions, which leave the flags
 * untouched.  Do not insert flag-modifying instructions between
 * testl ... jnz or cmpl ... jb.
 *
 * Registers written: %eax, %edx, %esi, %mm0, %mm2, %mm3, %mm4, %mm5.
 */
9413 | 66 #define YSCALEYUV2YV12X(x, offset) \ |
3344 | 67 "xorl %%eax, %%eax \n\t"\ |
11122 | 68 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\ |
69 "movq %%mm3, %%mm4 \n\t"\ | |
9413 | 70 "leal " offset "(%0), %%edx \n\t"\ |
71 "movl (%%edx), %%esi \n\t"\ | |
3344 | 72 ".balign 16 \n\t" /* FIXME Unroll? */\ |
73 "1: \n\t"\ | |
9413 | 74 "movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\ |
3344 | 75 "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\ |
76 "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\ | |
9413 | 77 "addl $16, %%edx \n\t"\ |
78 "movl (%%edx), %%esi \n\t"\ | |
79 "testl %%esi, %%esi \n\t"\ | |
3344 | 80 "pmulhw %%mm0, %%mm2 \n\t"\ |
81 "pmulhw %%mm0, %%mm5 \n\t"\ | |
82 "paddw %%mm2, %%mm3 \n\t"\ | |
83 "paddw %%mm5, %%mm4 \n\t"\ | |
84 " jnz 1b \n\t"\ | |
85 "psraw $3, %%mm3 \n\t"\ | |
86 "psraw $3, %%mm4 \n\t"\ | |
87 "packuswb %%mm4, %%mm3 \n\t"\ | |
9413 | 88 MOVNTQ(%%mm3, (%1, %%eax))\ |
3344 | 89 "addl $8, %%eax \n\t"\ |
9413 | 90 "cmpl %2, %%eax \n\t"\ |
11122 | 91 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\ |
92 "movq %%mm3, %%mm4 \n\t"\ | |
9413 | 93 "leal " offset "(%0), %%edx \n\t"\ |
94 "movl (%%edx), %%esi \n\t"\ | |
3344 | 95 "jb 1b \n\t" |
96 | |
/*
 * YSCALEYUV2YV121: asm fragment that converts 16-bit samples to bytes with
 * no filtering.  Each iteration loads two quadwords from (%0 + 2*%eax),
 * shifts every word right arithmetically by 7, packs the eight results to
 * bytes with unsigned saturation and stores them at (%1 + %eax).
 *
 * %eax starts at operand %2 and is advanced by 8; the loop repeats until the
 * addition produces a carry ("jnc 1b").  Presumably the caller passes a
 * negative start index with %0/%1 biased to the end of the buffers so the
 * carry occurs when the index wraps to zero -- TODO confirm at the call site.
 *
 * Registers written: %eax, %mm0, %mm1.
 */
97 #define YSCALEYUV2YV121 \ | |
98 "movl %2, %%eax \n\t"\ | |
99 ".balign 16 \n\t" /* FIXME Unroll? */\ | |
100 "1: \n\t"\ | |
101 "movq (%0, %%eax, 2), %%mm0 \n\t"\ | |
102 "movq 8(%0, %%eax, 2), %%mm1 \n\t"\ | |
103 "psraw $7, %%mm0 \n\t"\ | |
104 "psraw $7, %%mm1 \n\t"\ | |
105 "packuswb %%mm1, %%mm0 \n\t"\ | |
106 MOVNTQ(%%mm0, (%1, %%eax))\ | |
107 "addl $8, %%eax \n\t"\ | |
108 "jnc 1b \n\t" | |
109 | |
110 /* | |
111 :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
112 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
113 "r" (dest), "m" (dstW), | |
114 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
115 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
116 */ | |
/*
 * YSCALEYUV2PACKEDX: shared loop head for the multi-tap packed/RGB output
 * paths.  It zeroes %eax, opens loop label 1 and, for the current %eax,
 * runs two filter passes (both using local label 2):
 *
 *   chroma: walks the list at CHR_MMX_FILTER_OFFSET(%0); sources are read at
 *           (ptr + %eax) and 4096(ptr + %eax), which the inline comments
 *           call U and V.  Sums end up in %mm3 (U) and %mm4 (V).
 *   luma:   walks the list at LUM_MMX_FILTER_OFFSET(%0); sources are read at
 *           (ptr + 2*%eax) and 8(ptr + 2*%eax).  Sums end up in %mm1 (Y1)
 *           and %mm7 (Y2).
 *
 * Each list entry is 16 bytes (source pointer at +0, coefficients at +8) and
 * a zero source pointer terminates the list.  All four accumulators start
 * from the value at VROUNDER_OFFSET(%0).
 *
 * The macro does NOT close loop 1: it contains no store, no %eax increment
 * and no branch back to label 1, so the code that follows the expansion must
 * supply them.  The expansion ends on a continued blank line.
 *
 * Registers written: %eax, %edx, %esi, %mm0-%mm5, %mm7.
 */
7723 | 117 #define YSCALEYUV2PACKEDX \ |
3344 | 118 "xorl %%eax, %%eax \n\t"\ |
119 ".balign 16 \n\t"\ | |
9413 | 120 "nop \n\t"\ |
3344 | 121 "1: \n\t"\ |
9413 | 122 "leal "CHR_MMX_FILTER_OFFSET"(%0), %%edx \n\t"\ |
123 "movl (%%edx), %%esi \n\t"\ | |
11122 | 124 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\ |
125 "movq %%mm3, %%mm4 \n\t"\ | |
9413 | 126 ".balign 16 \n\t"\ |
3344 | 127 "2: \n\t"\ |
9413 | 128 "movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\ |
3344 | 129 "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\ |
130 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\ | |
9413 | 131 "addl $16, %%edx \n\t"\ |
132 "movl (%%edx), %%esi \n\t"\ | |
3344 | 133 "pmulhw %%mm0, %%mm2 \n\t"\ |
134 "pmulhw %%mm0, %%mm5 \n\t"\ | |
135 "paddw %%mm2, %%mm3 \n\t"\ | |
136 "paddw %%mm5, %%mm4 \n\t"\ | |
9413 | 137 "testl %%esi, %%esi \n\t"\ |
3344 | 138 " jnz 2b \n\t"\ |
139 \ | |
9413 | 140 "leal "LUM_MMX_FILTER_OFFSET"(%0), %%edx \n\t"\ |
141 "movl (%%edx), %%esi \n\t"\ | |
11122 | 142 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\ |
143 "movq %%mm1, %%mm7 \n\t"\ | |
9413 | 144 ".balign 16 \n\t"\ |
3344 | 145 "2: \n\t"\ |
9413 | 146 "movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\ |
3344 | 147 "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\ |
148 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\ | |
9413 | 149 "addl $16, %%edx \n\t"\ |
150 "movl (%%edx), %%esi \n\t"\ | |
3344 | 151 "pmulhw %%mm0, %%mm2 \n\t"\ |
152 "pmulhw %%mm0, %%mm5 \n\t"\ | |
153 "paddw %%mm2, %%mm1 \n\t"\ | |
154 "paddw %%mm5, %%mm7 \n\t"\ | |
9413 | 155 "testl %%esi, %%esi \n\t"\ |
3344 | 156 " jnz 2b \n\t"\ |
7723 | 157 |
158 | |
/*
 * YSCALEYUV2RGBX: expands YSCALEYUV2PACKEDX (which leaves U in %mm3, V in
 * %mm4, Y1 in %mm1 and Y2 in %mm7, and leaves loop label 1 open) and then
 * converts those values to RGB using constants read relative to %0:
 *
 *   - subtracts U_OFFSET / V_OFFSET / Y_OFFSET,
 *   - multiplies (pmulhw) by UG_COEFF, VG_COEFF, UB_COEFF, VR_COEFF and
 *     Y_COEFF,
 *   - duplicates each chroma term with punpcklwd/punpckhwd so that one chroma
 *     word is added to two luma words,
 *   - packs to bytes with unsigned saturation.
 *
 * On exit %mm2 holds the packed blue bytes (UB term), %mm4 the packed green
 * bytes (UG+VG term), %mm5 the packed red bytes (VR term) and %mm7 is zero.
 * The loop opened by YSCALEYUV2PACKEDX is still open; the store and the
 * loop-closing branch must follow the expansion.
 */
159 #define YSCALEYUV2RGBX \ | |
160 YSCALEYUV2PACKEDX\ | |
9413 | 161 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ |
162 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ | |
3344 | 163 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
164 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
9413 | 165 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ |
166 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ | |
3344 | 167 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
9413 | 168 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ |
169 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ | |
170 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ | |
171 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ | |
172 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ | |
173 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ | |
3344 | 174 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
175 "paddw %%mm3, %%mm4 \n\t"\ | |
176 "movq %%mm2, %%mm0 \n\t"\ | |
177 "movq %%mm5, %%mm6 \n\t"\ | |
178 "movq %%mm4, %%mm3 \n\t"\ | |
179 "punpcklwd %%mm2, %%mm2 \n\t"\ | |
180 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
181 "punpcklwd %%mm4, %%mm4 \n\t"\ | |
182 "paddw %%mm1, %%mm2 \n\t"\ | |
183 "paddw %%mm1, %%mm5 \n\t"\ | |
184 "paddw %%mm1, %%mm4 \n\t"\ | |
185 "punpckhwd %%mm0, %%mm0 \n\t"\ | |
186 "punpckhwd %%mm6, %%mm6 \n\t"\ | |
187 "punpckhwd %%mm3, %%mm3 \n\t"\ | |
188 "paddw %%mm7, %%mm0 \n\t"\ | |
189 "paddw %%mm7, %%mm6 \n\t"\ | |
190 "paddw %%mm7, %%mm3 \n\t"\ | |
191 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
192 "packuswb %%mm0, %%mm2 \n\t"\ | |
193 "packuswb %%mm6, %%mm5 \n\t"\ | |
194 "packuswb %%mm3, %%mm4 \n\t"\ | |
195 "pxor %%mm7, %%mm7 \n\t" | |
/*
 * FULL_YSCALEYUV2RGB is compiled out by the surrounding "#if 0".
 *
 * As written it broadcasts the low 16 bits of operands %6 and %7 (commented
 * as yalpha1 / uvalpha1) to all four words of %mm6 / %mm5, zeroes %eax and
 * opens loop label 1.  One iteration blends four words from %0 and %1 (luma)
 * and four words each from %2/%3 and 4096(%2)/4096(%3) (chroma) and converts
 * them to RGB, leaving packed B in %mm3, R in %mm0 and G in %mm1.
 *
 * Unlike the enabled macros in this file, the conversion constants are
 * referenced as global symbols through MANGLE() (w80, w400, yCoeff, ubCoeff,
 * ugCoeff, vrCoeff, vgCoeff) rather than as offsets from a context pointer.
 * The macro contains no store, no %eax increment and no branch back to
 * label 1.
 */
9413 | 196 #if 0 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
197 #define FULL_YSCALEYUV2RGB \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
198 "pxor %%mm7, %%mm7 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
199 "movd %6, %%mm6 \n\t" /*yalpha1*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
200 "punpcklwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
201 "punpcklwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
202 "movd %7, %%mm5 \n\t" /*uvalpha1*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
203 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
204 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
205 "xorl %%eax, %%eax \n\t"\ |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
206 ".balign 16 \n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
207 "1: \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
208 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
209 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
210 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
211 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
212 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
213 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
214 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
215 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
216 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
217 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
218 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
219 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
220 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
221 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
222 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ |
4248 | 223 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\ |
224 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\ | |
225 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
226 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
227 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
228 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
229 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
4248 | 230 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
231 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ |
4248 | 232 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
233 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ |
4248 | 234 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
235 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
236 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
237 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ |
4248 | 238 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\ |
239 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
240 "paddw %%mm1, %%mm3 \n\t" /* B*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
241 "paddw %%mm1, %%mm0 \n\t" /* R*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
242 "packuswb %%mm3, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
243 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
244 "packuswb %%mm0, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
245 "paddw %%mm4, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
246 "paddw %%mm2, %%mm1 \n\t" /* G*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
247 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
248 "packuswb %%mm1, %%mm1 \n\t" |
9413 | 249 #endif |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
250 |
/*
 * YSCALEYUV2PACKED(index, c): loop head that blends two input lines
 * (luma from %0/%1, chroma from %2/%3) for packed output.
 *
 * Macro arguments (both stringified into the asm text):
 *   index - register used as the loop index; zeroed here.
 *   c     - base register for the CHR_/LUM_MMX_FILTER_OFFSET displacements.
 *
 * Before the loop, the quadwords at CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c) are loaded, shifted right by 3 and WRITTEN BACK
 * to memory, so the macro modifies the data c points to.
 * NOTE(review): executing this twice on the same data would shift the
 * coefficients twice; whether callers refresh them first is not visible
 * here -- confirm.
 *
 * Each iteration computes, per 16-bit word,
 *     (line1 >> 7) + pmulhw(line0 - line1, coefficient)
 * leaving chroma in %mm3 (from (%2)/(%3)) and %mm4 (from 4096(%2)/4096(%3))
 * and luma in %mm1 and %mm7.  Loop label 1 is left open: there is no store,
 * no index increment and no branch back inside this macro.  The expansion
 * ends on a continued blank line.
 *
 * Registers written: index, %mm0-%mm7.
 */
9414 | 251 #define YSCALEYUV2PACKED(index, c) \ |
252 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\ | |
253 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\ | |
254 "psraw $3, %%mm0 \n\t"\ | |
255 "psraw $3, %%mm1 \n\t"\ | |
256 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\ | |
257 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\ | |
258 "xorl "#index", "#index" \n\t"\ | |
7723 | 259 ".balign 16 \n\t"\ |
260 "1: \n\t"\ | |
9414 | 261 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
262 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
263 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | |
264 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
7723 | 265 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ |
266 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
9414 | 267 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\ |
7723 | 268 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ |
269 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
270 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\ | |
271 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\ | |
272 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | |
273 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | |
9414 | 274 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ |
275 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
276 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ | |
277 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ | |
7723 | 278 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
279 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
9414 | 280 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
281 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
7723 | 282 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\ |
283 "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7 (second quadword)*/\ | |
284 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
285 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
286 | |
/*
 * YSCALEYUV2RGB(index, c): loop head that blends two input lines (luma from
 * %0/%1, chroma from %2/%3) and converts the result to RGB.
 *
 * Macro arguments (both stringified into the asm text):
 *   index - register used as the loop index; zeroed here.
 *   c     - base register for all *_OFFSET / *_COEFF displacements.
 *
 * Each iteration computes, per 16-bit word,
 *     (line1 >> 4) + pmulhw(line0 - line1, coefficient)
 * with the chroma coefficient read from CHR_MMX_FILTER_OFFSET+8(c) and the
 * luma coefficient from LUM_MMX_FILTER_OFFSET+8(c).  It then subtracts
 * U_OFFSET / V_OFFSET / Y_OFFSET, multiplies by UG/VG/UB/VR/Y_COEFF, spreads
 * each chroma term over two luma words with punpcklwd/punpckhwd and packs to
 * bytes with unsigned saturation.
 *
 * On exit %mm2 holds the packed blue bytes (UB term), %mm4 the packed green
 * bytes (UG+VG term), %mm5 the packed red bytes (VR term) and %mm7 is zero.
 * Loop label 1 is left open: the store, the index increment and the branch
 * back must follow the expansion.
 */
9414 | 287 #define YSCALEYUV2RGB(index, c) \ |
288 "xorl "#index", "#index" \n\t"\ | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
289 ".balign 16 \n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
290 "1: \n\t"\ |
9414 | 291 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
292 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
293 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\ | |
294 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
295 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
296 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ |
9414 | 297 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
298 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
299 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
300 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
301 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
302 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
303 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ |
9414 | 304 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ |
305 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
306 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
307 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
9414 | 308 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\ |
309 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
310 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
9414 | 311 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ |
312 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
313 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\ | |
314 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
315 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
316 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ |
9414 | 317 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
318 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
319 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
320 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
321 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
322 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
9414 | 323 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\ |
324 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\ | |
325 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
326 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
327 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
328 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
329 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
330 "paddw %%mm3, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
331 "movq %%mm2, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
332 "movq %%mm5, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
333 "movq %%mm4, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
334 "punpcklwd %%mm2, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
335 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
336 "punpcklwd %%mm4, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
337 "paddw %%mm1, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
338 "paddw %%mm1, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
339 "paddw %%mm1, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
340 "punpckhwd %%mm0, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
341 "punpckhwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
342 "punpckhwd %%mm3, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
343 "paddw %%mm7, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
344 "paddw %%mm7, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
345 "paddw %%mm7, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
346 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
347 "packuswb %%mm0, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
348 "packuswb %%mm6, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
349 "packuswb %%mm3, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
350 "pxor %%mm7, %%mm7 \n\t" |
7723 | 351 |
/*
 * YSCALEYUV2PACKED1(index, c): loop head for packed output from a single
 * input line (no blending with a second line).
 *
 * `index` is stringified into the asm text and zeroed here; `c` is accepted
 * but not referenced anywhere in the expansion.
 *
 * Each iteration loads chroma from (%2 + index) and 4096(%2 + index) into
 * %mm3 / %mm4 and luma from (%0 + 2*index) and 8(%0 + 2*index) into
 * %mm1 / %mm7, and shifts every word right arithmetically by 7.
 * Loop label 1 is left open: there is no store, no index increment and no
 * branch back inside this macro.  The expansion ends on a continued blank
 * line.
 */
9417 | 352 #define YSCALEYUV2PACKED1(index, c) \ |
353 "xorl "#index", "#index" \n\t"\ | |
7723 | 354 ".balign 16 \n\t"\ |
355 "1: \n\t"\ | |
9417 | 356 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ |
357 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
7723 | 358 "psraw $7, %%mm3 \n\t" \ |
359 "psraw $7, %%mm4 \n\t" \ | |
9417 | 360 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ |
361 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ | |
7723 | 362 "psraw $7, %%mm1 \n\t" \ |
363 "psraw $7, %%mm7 \n\t" \ | |
364 | |
9417 | 365 #define YSCALEYUV2RGB1(index, c) \ |
366 "xorl "#index", "#index" \n\t"\ | |
2800
7847d6b7ad3d
.balign or we'll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
367 ".balign 16 \n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
368 "1: \n\t"\ |
9417 | 369 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ |
370 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
371 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
372 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ |
9417 | 373 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ |
374 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
375 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
376 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
9417 | 377 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\ |
378 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
379 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
9417 | 380 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ |
381 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
382 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
383 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
9417 | 384 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\ |
385 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\ | |
386 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
387 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
388 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
389 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
390 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
391 "paddw %%mm3, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
392 "movq %%mm2, %%mm0 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
393 "movq %%mm5, %%mm6 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
394 "movq %%mm4, %%mm3 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
395 "punpcklwd %%mm2, %%mm2 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
396 "punpcklwd %%mm5, %%mm5 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
397 "punpcklwd %%mm4, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
398 "paddw %%mm1, %%mm2 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
399 "paddw %%mm1, %%mm5 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
400 "paddw %%mm1, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
401 "punpckhwd %%mm0, %%mm0 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
402 "punpckhwd %%mm6, %%mm6 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
403 "punpckhwd %%mm3, %%mm3 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
404 "paddw %%mm7, %%mm0 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
405 "paddw %%mm7, %%mm6 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
406 "paddw %%mm7, %%mm3 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
407 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
408 "packuswb %%mm0, %%mm2 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
409 "packuswb %%mm6, %%mm5 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
410 "packuswb %%mm3, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
411 "pxor %%mm7, %%mm7 \n\t" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
412 |
/*
 * YSCALEYUV2PACKED1b(index, c): loop head for packed output that sums two
 * chroma lines.
 * Zeroes `index` and opens local label "1:"; this macro does not close the
 * loop itself (the backward jump "jb 1b" is emitted by the writer macros such
 * as WRITEYUY2).
 * Per iteration it adds the chroma words at (%2,index) and (%3,index), and the
 * words 4096 bytes further on, shifts both sums right by 8 (results in mm3
 * and mm4), then loads 8 luma words from (%0,index,2) into mm1/mm7 and shifts
 * them right by 7.
 * `c` is not referenced in this macro body.
 * Operand roles (%0=buf0, %2=uvbuf0, %3=uvbuf1) follow the inline comments;
 * the actual operand lists are at the use sites.
 */
9417 | 413 #define YSCALEYUV2PACKED1b(index, c) \ |
414 "xorl "#index", "#index" \n\t"\ | |
7723 | 415 ".balign 16 \n\t"\ |
416 "1: \n\t"\ | |
9417 | 417 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
418 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
419 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | |
420 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
7723 | 421 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ |
422 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ | |
423 "psrlw $8, %%mm3 \n\t" \ | |
424 "psrlw $8, %%mm4 \n\t" \ | |
9417 | 425 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ |
426 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\ | |
7723 | 427 "psraw $7, %%mm1 \n\t" \ |
428 "psraw $7, %%mm7 \n\t" | |
429 | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
430 // do vertical chrominance interpolation |
/*
 * YSCALEYUV2RGB1b(index, c): loop head for YUV->RGB conversion with vertical
 * chrominance interpolation (sum of two chroma lines).
 * Zeroes `index` and opens local label "1:"; the matching "jb 1b" is emitted
 * by the WRITEBGRxx macros that follow it at the use site.
 * Per iteration:
 *   - adds the chroma words at (%2,index) and (%3,index), and the words 4096
 *     bytes further on, shifts the sums right by 5 and subtracts
 *     U_OFFSET / V_OFFSET (read relative to `c`);
 *   - loads 8 luma words from (%0,index,2), shifts them right by 4, subtracts
 *     Y_OFFSET and multiplies by Y_COEFF (pmulhw);
 *   - multiplies chroma by UG/VG/UB/VR_COEFF (pmulhw), adds luma, and packs
 *     the sums to bytes with unsigned saturation.
 * On exit mm2=B, mm4=G, mm5=R (8 bytes each) and mm7=0; mm0, mm1, mm3 and mm6
 * are overwritten as scratch.
 * Operand roles (%0=buf0, %2=uvbuf0, %3=uvbuf1) follow the inline comments;
 * the actual operand lists are at the use sites.
 */
9417 | 431 #define YSCALEYUV2RGB1b(index, c) \ |
432 "xorl "#index", "#index" \n\t"\ | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
433 ".balign 16 \n\t"\ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
434 "1: \n\t"\ |
9417 | 435 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
436 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
437 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | |
438 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
2576 | 439 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ |
440 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ | |
3344 | 441 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ |
442 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ | |
9417 | 443 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ |
444 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
445 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
446 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
9417 | 447 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\ |
448 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\ | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
449 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
9417 | 450 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ |
451 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
452 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
453 "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\ |
9417 | 454 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\ |
455 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\ | |
456 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
457 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
458 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
459 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
460 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
461 "paddw %%mm3, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
462 "movq %%mm2, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
463 "movq %%mm5, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
464 "movq %%mm4, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
465 "punpcklwd %%mm2, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
466 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
467 "punpcklwd %%mm4, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
468 "paddw %%mm1, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
469 "paddw %%mm1, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
470 "paddw %%mm1, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
471 "punpckhwd %%mm0, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
472 "punpckhwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
473 "punpckhwd %%mm3, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
474 "paddw %%mm7, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
475 "paddw %%mm7, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
476 "paddw %%mm7, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
477 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
478 "packuswb %%mm0, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
479 "packuswb %%mm6, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
480 "packuswb %%mm3, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
481 "pxor %%mm7, %%mm7 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
482 |
/*
 * WRITEBGR32(dst, dstw, index): loop tail that stores 8 pixels as 32-bit
 * words and closes the loop opened at local label "1:".
 * Expects mm2=B, mm4=G, mm5=R (8 bytes each) and mm7=0.
 * Interleaves the bytes into four quadwords of two "0RGB" dwords each, stores
 * them with MOVNTQ at dst + index*4 (+0, +8, +16, +24), adds 8 to `index` and
 * jumps back to "1:" while index is below dstw (unsigned compare, jb).
 * Overwrites mm0-mm3, mm5 and mm6.
 */
9414 | 483 #define WRITEBGR32(dst, dstw, index) \ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
484 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
485 "movq %%mm2, %%mm1 \n\t" /* B */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
486 "movq %%mm5, %%mm6 \n\t" /* R */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
487 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
488 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
489 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
490 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
491 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
492 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
493 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
494 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
495 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
496 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
497 \ |
9414 | 498 MOVNTQ(%%mm0, (dst, index, 4))\ |
499 MOVNTQ(%%mm2, 8(dst, index, 4))\ | |
500 MOVNTQ(%%mm1, 16(dst, index, 4))\ | |
501 MOVNTQ(%%mm3, 24(dst, index, 4))\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
502 \ |
9414 | 503 "addl $8, "#index" \n\t"\ |
504 "cmpl "#dstw", "#index" \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
505 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
506 |
/*
 * WRITEBGR16(dst, dstw, index): loop tail that stores 8 pixels as 16-bit
 * words and closes the loop opened at local label "1:".
 * Expects B in mm2, G in mm4, R in mm5 (8 bytes each); mm7 supplies the high
 * byte of the widened G words, so it must be zero on entry.
 * B and R are masked with bF8 and G with bFC; B is shifted right by 3, R
 * becomes the high byte of each word, and G is widened to words and shifted
 * left by 3 before being OR-ed in. That puts B in bits 0-4, G in bits 5-10
 * and R in bits 11-15.
 * Stores two quadwords with MOVNTQ at dst + index*2 (+0, +8), adds 8 to
 * `index` and jumps back to "1:" while index is below dstw (unsigned, jb).
 * Overwrites mm1-mm5.
 */
9414 | 507 #define WRITEBGR16(dst, dstw, index) \ |
4248 | 508 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ |
509 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\ | |
510 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ | |
2669 | 511 "psrlq $3, %%mm2 \n\t"\ |
512 \ | |
513 "movq %%mm2, %%mm1 \n\t"\ | |
514 "movq %%mm4, %%mm3 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
515 \ |
2669 | 516 "punpcklbw %%mm7, %%mm3 \n\t"\ |
517 "punpcklbw %%mm5, %%mm2 \n\t"\ | |
518 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
519 "punpckhbw %%mm5, %%mm1 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
520 \ |
2669 | 521 "psllq $3, %%mm3 \n\t"\ |
522 "psllq $3, %%mm4 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
523 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
524 "por %%mm3, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
525 "por %%mm4, %%mm1 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
526 \ |
9414 | 527 MOVNTQ(%%mm2, (dst, index, 2))\ |
528 MOVNTQ(%%mm1, 8(dst, index, 2))\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
529 \ |
9414 | 530 "addl $8, "#index" \n\t"\ |
531 "cmpl "#dstw", "#index" \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
532 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
533 |
/*
 * WRITEBGR15(dst, dstw, index): loop tail that stores 8 pixels as 15-bit
 * colour in 16-bit words and closes the loop opened at local label "1:".
 * Expects B in mm2, G in mm4, R in mm5 (8 bytes each); mm7 supplies the high
 * byte of the widened G words, so it must be zero on entry.
 * All three channels are masked with bF8; B is shifted right by 3 and R by 1,
 * R becomes the high byte of each word, and G is widened to words and shifted
 * left by 2 before being OR-ed in. That puts B in bits 0-4, G in bits 5-9 and
 * R in bits 10-14.
 * Stores two quadwords with MOVNTQ at dst + index*2 (+0, +8), adds 8 to
 * `index` and jumps back to "1:" while index is below dstw (unsigned, jb).
 * Overwrites mm1-mm5.
 */
9414 | 534 #define WRITEBGR15(dst, dstw, index) \ |
4248 | 535 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ |
536 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ | |
537 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ | |
2669 | 538 "psrlq $3, %%mm2 \n\t"\ |
539 "psrlq $1, %%mm5 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
540 \ |
2669 | 541 "movq %%mm2, %%mm1 \n\t"\ |
542 "movq %%mm4, %%mm3 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
543 \ |
2669 | 544 "punpcklbw %%mm7, %%mm3 \n\t"\ |
545 "punpcklbw %%mm5, %%mm2 \n\t"\ | |
546 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
547 "punpckhbw %%mm5, %%mm1 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
548 \ |
2669 | 549 "psllq $2, %%mm3 \n\t"\ |
550 "psllq $2, %%mm4 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
551 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
552 "por %%mm3, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
553 "por %%mm4, %%mm1 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
554 \ |
9414 | 555 MOVNTQ(%%mm2, (dst, index, 2))\ |
556 MOVNTQ(%%mm1, 8(dst, index, 2))\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
557 \ |
9414 | 558 "addl $8, "#index" \n\t"\ |
559 "cmpl "#dstw", "#index" \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
560 " jb 1b \n\t" |
2669 | 561 |
/*
 * WRITEBGR24OLD(dst, dstw, index): mask-based loop tail that stores 8 pixels
 * as packed 24-bit triples and closes the loop opened at local label "1:".
 * Expects mm2=B, mm4=G, mm5=R (8 bytes each) and mm7=0.
 * First builds four quadwords of "0RGB" dwords (as WRITEBGR32 does), then
 * squeezes out the padding bytes using the bm00000111, bm11111000 and
 * bm00001111 masks plus shifts, giving three quadwords (24 bytes).
 * Stores them with MOVNTQ at dst, dst+8 and dst+16, then advances `dst`
 * itself by 24, adds 8 to `index` and jumps back to "1:" while index is below
 * dstw (unsigned, jb).
 * `dst` is modified, so it must expand to a register the asm may write.
 * Overwrites mm0-mm6.
 */
9414 | 562 #define WRITEBGR24OLD(dst, dstw, index) \ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
563 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
564 "movq %%mm2, %%mm1 \n\t" /* B */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
565 "movq %%mm5, %%mm6 \n\t" /* R */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
566 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
567 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
568 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
569 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
570 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
571 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ |
2326 | 572 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ |
573 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
574 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
575 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
576 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
577 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
578 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\ |
4248 | 579 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\ |
580 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
581 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
582 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
583 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
584 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
585 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
586 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
587 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
588 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
589 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\ |
4248 | 590 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
591 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
592 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\ |
4248 | 593 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\ |
594 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
595 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
596 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
597 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
598 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
599 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
600 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
601 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
602 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\ |
4248 | 603 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\ |
604 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
605 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
606 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
607 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
608 \ |
9414 | 609 MOVNTQ(%%mm0, (dst))\ |
610 MOVNTQ(%%mm2, 8(dst))\ | |
611 MOVNTQ(%%mm3, 16(dst))\ | |
612 "addl $24, "#dst" \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
613 \ |
9414 | 614 "addl $8, "#index" \n\t"\ |
615 "cmpl "#dstw", "#index" \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
616 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
617 |
/*
 * WRITEBGR24MMX(dst, dstw, index): loop tail that stores 8 pixels as packed
 * 24-bit triples using only shifts and unpacks (no mask constants), and
 * closes the loop opened at local label "1:".
 * Expects mm2=B, mm4=G, mm5=R (8 bytes each) and mm7=0.
 * Builds four quadwords of "0RGB" dwords, removes the padding byte of each
 * pair with psllq/punpckhdq, then merges neighbours into three quadwords
 * stored with MOVNTQ at dst, dst+8 and dst+16.
 * Afterwards advances `dst` itself by 24, adds 8 to `index` and jumps back to
 * "1:" while index is below dstw (unsigned, jb).
 * Overwrites mm0-mm7; in particular mm7 no longer holds zero afterwards.
 */
9414 | 618 #define WRITEBGR24MMX(dst, dstw, index) \ |
2730 | 619 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
620 "movq %%mm2, %%mm1 \n\t" /* B */\ | |
621 "movq %%mm5, %%mm6 \n\t" /* R */\ | |
622 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
623 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
624 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
625 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
626 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
627 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
628 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
629 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
630 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
631 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
632 \ | |
633 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ | |
634 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ | |
635 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ | |
636 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ | |
637 \ | |
638 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ | |
639 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ | |
640 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ | |
641 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ | |
642 \ | |
643 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ | |
644 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ | |
645 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ | |
646 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ | |
647 \ | |
648 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ | |
649 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ | |
650 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ | |
651 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ | |
9414 | 652 MOVNTQ(%%mm0, (dst))\ |
2730 | 653 \ |
654 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ | |
655 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ | |
656 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ | |
657 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ | |
9414 | 658 MOVNTQ(%%mm6, 8(dst))\ |
2730 | 659 \ |
660 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ | |
661 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ | |
662 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ | |
9414 | 663 MOVNTQ(%%mm5, 16(dst))\ |
2730 | 664 \ |
9414 | 665 "addl $24, "#dst" \n\t"\ |
2730 | 666 \ |
9414 | 667 "addl $8, "#index" \n\t"\ |
668 "cmpl "#dstw", "#index" \n\t"\ | |
2730 | 669 " jb 1b \n\t" |
670 | |
/*
 * WRITEBGR24MMX2(dst, dstw, index): loop tail that stores 8 pixels as packed
 * 24-bit triples using pshufw and the M24A/M24B/M24C byte masks, and closes
 * the loop opened at local label "1:".
 * Expects mm2=B, mm4=G, mm5=R (8 bytes each). mm0 and mm7 are loaded with
 * M24A and M24C here, so any previous contents (including a zero in mm7) are
 * lost.
 * Each of the three output quadwords is built by replicating the needed
 * source words with pshufw, masking out the byte positions that belong to
 * each channel and OR-ing the three channels together; G is shifted by one
 * byte (psllq for the first quadword, psrlq on mm4 for the rest) to line up.
 * Stores with MOVNTQ at dst, dst+8 and dst+16, then advances `dst` itself by
 * 24, adds 8 to `index` and jumps back to "1:" while index is below dstw
 * (unsigned, jb).
 * Overwrites mm0, mm1, mm3, mm4, mm6 and mm7; mm2 and mm5 are only read.
 */
9414 | 671 #define WRITEBGR24MMX2(dst, dstw, index) \ |
2730 | 672 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
4248 | 673 "movq "MANGLE(M24A)", %%mm0 \n\t"\ |
674 "movq "MANGLE(M24C)", %%mm7 \n\t"\ | |
2730 | 675 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ |
676 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ | |
677 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ | |
678 \ | |
679 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ | |
680 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ | |
681 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ | |
682 \ | |
683 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ | |
684 "por %%mm1, %%mm6 \n\t"\ | |
685 "por %%mm3, %%mm6 \n\t"\ | |
9414 | 686 MOVNTQ(%%mm6, (dst))\ |
2730 | 687 \ |
688 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ | |
689 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ | |
690 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ | |
691 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ | |
692 \ | |
4248 | 693 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ |
2730 | 694 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ |
695 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ | |
696 \ | |
697 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ | |
698 "por %%mm3, %%mm6 \n\t"\ | |
9414 | 699 MOVNTQ(%%mm6, 8(dst))\ |
2730 | 700 \ |
701 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\ | |
702 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ | |
703 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\ | |
704 \ | |
705 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ | |
706 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ | |
4248 | 707 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ |
2730 | 708 \ |
709 "por %%mm1, %%mm3 \n\t"\ | |
710 "por %%mm3, %%mm6 \n\t"\ | |
9414 | 711 MOVNTQ(%%mm6, 16(dst))\ |
2730 | 712 \ |
9414 | 713 "addl $24, "#dst" \n\t"\ |
2730 | 714 \ |
9414 | 715 "addl $8, "#index" \n\t"\ |
716 "cmpl "#dstw", "#index" \n\t"\ | |
2730 | 717 " jb 1b \n\t" |
718 | |
/*
 * Select the 24-bit writer for this instantiation of the template:
 * the pshufw-based WRITEBGR24MMX2 when HAVE_MMX2 is defined, otherwise the
 * plain-MMX WRITEBGR24MMX. The #undef first drops any earlier definition of
 * WRITEBGR24 before it is redefined.
 */
719 #ifdef HAVE_MMX2 | |
3126 | 720 #undef WRITEBGR24 |
2730 | 721 #define WRITEBGR24 WRITEBGR24MMX2 |
722 #else | |
3126 | 723 #undef WRITEBGR24 |
2730 | 724 #define WRITEBGR24 WRITEBGR24MMX |
725 #endif | |
726 | |
/*
 * WRITEYUY2(dst, dstw, index): loop tail that stores 8 pixels as packed
 * 4:2:2 (two bytes per pixel) and closes the loop opened at local label "1:".
 * Expects 8 luma words in mm1 (first four) and mm7 (last four) and 4 chroma
 * words each in mm3 and mm4, i.e. the register layout YSCALEYUV2PACKED1b
 * leaves behind. Presumably mm3 is U and mm4 is V, matching the register use
 * in the YUV->RGB macros; confirm at the use sites.
 * Packs everything to bytes with unsigned saturation, interleaves mm3/mm4
 * into chroma pairs, then interleaves luma with those pairs to give the byte
 * order Y0 C3_0 Y1 C4_0 Y2 C3_1 Y3 C4_1 ... (C3 = from mm3, C4 = from mm4).
 * Stores two quadwords with MOVNTQ at dst + index*2 (+0, +8), adds 8 to
 * `index` and jumps back to "1:" while index is below dstw (unsigned, jb).
 * Overwrites mm1, mm3, mm4 and mm7.
 */
9414 | 727 #define WRITEYUY2(dst, dstw, index) \ |
7723 | 728 "packuswb %%mm3, %%mm3 \n\t"\ |
729 "packuswb %%mm4, %%mm4 \n\t"\ | |
730 "packuswb %%mm7, %%mm1 \n\t"\ | |
731 "punpcklbw %%mm4, %%mm3 \n\t"\ | |
732 "movq %%mm1, %%mm7 \n\t"\ | |
733 "punpcklbw %%mm3, %%mm1 \n\t"\ | |
734 "punpckhbw %%mm3, %%mm7 \n\t"\ | |
735 \ | |
9414 | 736 MOVNTQ(%%mm1, (dst, index, 2))\ |
737 MOVNTQ(%%mm7, 8(dst, index, 2))\ | |
7723 | 738 \ |
9414 | 739 "addl $8, "#index" \n\t"\ |
740 "cmpl "#dstw", "#index" \n\t"\ | |
7723 | 741 " jb 1b \n\t" |
742 | |
743 | |
/**
 * Produce one line of planar output (dest, and optionally uDest / vDest)
 * from the luma and chroma source line sets, dispatching on the build's
 * SIMD support.
 *
 * HAVE_MMX: runs the YSCALEYUV2YV12X asm macro (defined outside this view)
 * up to three times, each with &c->redDither, a destination pointer and a
 * width as operands. With CHR_MMX_FILTER_OFFSET it runs for uDest (first
 * argument 0) and for vDest (first argument 4096), both with chrDstW and
 * both skipped when uDest is NULL. With LUM_MMX_FILTER_OFFSET it always runs
 * for dest with dstW. The filter and source-pointer parameters are not
 * referenced on this path; presumably the data is reached through `c` at the
 * given offsets (confirm against the macro and the caller).
 *
 * Otherwise all arguments except `c` are forwarded unchanged to
 * yuv2yuvX_altivec_real() when HAVE_ALTIVEC is defined, or else to the
 * portable yuv2yuvXinC().
 *
 * @param c              scaler context; only &c->redDither is used here
 * @param lumFilter      luma filter coefficients (non-MMX paths only)
 * @param lumSrc         luma source lines (non-MMX paths only)
 * @param lumFilterSize  number of luma filter taps (non-MMX paths only)
 * @param chrFilter      chroma filter coefficients (non-MMX paths only)
 * @param chrSrc         chroma source lines (non-MMX paths only)
 * @param chrFilterSize  number of chroma filter taps (non-MMX paths only)
 * @param dest           luma output line
 * @param uDest          first chroma output line; NULL skips chroma on MMX
 * @param vDest          second chroma output line
 * @param dstW           luma output width
 * @param chrDstW        chroma output width
 *
 * NOTE(review): each asm statement declares only "%eax", "%edx", "%esi" as
 * clobbers. If the macro stores through operand 1 (the destination pointer),
 * a "memory" clobber may be needed; verify against YSCALEYUV2YV12X.
 */
9413 | 744 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, |
3344 | 745 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, |
9414 | 746 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW) |
2519 | 747 { |
3344 | 748 #ifdef HAVE_MMX |
749 if(uDest != NULL) | |
750 { | |
751 asm volatile( | |
9413 | 752 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET) |
753 :: "r" (&c->redDither), | |
754 "r" (uDest), "m" (chrDstW) | |
3344 | 755 : "%eax", "%edx", "%esi" |
756 ); | |
2519 | 757 |
3344 | 758 asm volatile( |
9413 | 759 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET) |
760 :: "r" (&c->redDither), | |
761 "r" (vDest), "m" (chrDstW) | |
3344 | 762 : "%eax", "%edx", "%esi" |
763 ); | |
764 } | |
2521 | 765 |
3344 | 766 asm volatile( |
9413 | 767 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET) |
768 :: "r" (&c->redDither), | |
769 "r" (dest), "m" (dstW) | |
3344 | 770 : "%eax", "%edx", "%esi" |
771 ); | |
772 #else | |
12017
21e5cb258a95
AltiVec support in postproc/ + altivec optimizations for yuv2yuvX patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
11122
diff
changeset
|
773 #ifdef HAVE_ALTIVEC |
21e5cb258a95
AltiVec support in postproc/ + altivec optimizations for yuv2yuvX patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
11122
diff
changeset
|
774 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize, |
21e5cb258a95
AltiVec support in postproc/ + altivec optimizations for yuv2yuvX patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
11122
diff
changeset
|
775 chrFilter, chrSrc, chrFilterSize, |
21e5cb258a95
AltiVec support in postproc/ + altivec optimizations for yuv2yuvX patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
11122
diff
changeset
|
776 dest, uDest, vDest, dstW, chrDstW); |
21e5cb258a95
AltiVec support in postproc/ + altivec optimizations for yuv2yuvX patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
11122
diff
changeset
|
777 #else //HAVE_ALTIVEC |
6540 | 778 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, |
3352 | 779 chrFilter, chrSrc, chrFilterSize, |
6540 | 780 dest, uDest, vDest, dstW, chrDstW); |
12017
21e5cb258a95
AltiVec support in postproc/ + altivec optimizations for yuv2yuvX patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
11122
diff
changeset
|
781 #endif //!HAVE_ALTIVEC |
3344 | 782 #endif |
783 } | |
784 | |
785 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc, | |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
786 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW) |
3344 | 787 { |
788 #ifdef HAVE_MMX | |
789 if(uDest != NULL) | |
790 { | |
791 asm volatile( | |
792 YSCALEYUV2YV121 | |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
793 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW), |
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
794 "g" (-chrDstW) |
3344 | 795 : "%eax" |
796 ); | |
797 | |
798 asm volatile( | |
799 YSCALEYUV2YV121 | |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
800 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW), |
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
801 "g" (-chrDstW) |
3344 | 802 : "%eax" |
803 ); | |
2519 | 804 } |
3344 | 805 |
806 asm volatile( | |
807 YSCALEYUV2YV121 | |
808 :: "r" (lumSrc + dstW), "r" (dest + dstW), | |
809 "g" (-dstW) | |
810 : "%eax" | |
811 ); | |
812 #else | |
813 int i; | |
814 for(i=0; i<dstW; i++) | |
815 { | |
816 int val= lumSrc[i]>>7; | |
6503 | 817 |
818 if(val&256){ | |
819 if(val<0) val=0; | |
820 else val=255; | |
821 } | |
3344 | 822 |
6503 | 823 dest[i]= val; |
3344 | 824 } |
825 | |
826 if(uDest != NULL) | |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
827 for(i=0; i<chrDstW; i++) |
3344 | 828 { |
829 int u=chrSrc[i]>>7; | |
830 int v=chrSrc[i + 2048]>>7; | |
831 | |
6503 | 832 if((u|v)&256){ |
833 if(u<0) u=0; | |
834 else if (u>255) u=255; | |
835 if(v<0) v=0; | |
836 else if (v>255) v=255; | |
837 } | |
838 | |
839 uDest[i]= u; | |
840 vDest[i]= v; | |
3344 | 841 } |
842 #endif | |
2519 | 843 } |
844 | |
3344 | 845 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
846 /** |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
847 * vertical scale YV12 to RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
848 */ |
7723 | 849 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, |
3344 | 850 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, |
9413 | 851 uint8_t *dest, int dstW, int dstY) |
3344 | 852 { |
9413 | 853 int dummy=0; |
6578 | 854 switch(c->dstFormat) |
3344 | 855 { |
856 #ifdef HAVE_MMX | |
6578 | 857 case IMGFMT_BGR32: |
3344 | 858 { |
859 asm volatile( | |
860 YSCALEYUV2RGBX | |
9414 | 861 WRITEBGR32(%4, %5, %%eax) |
3344 | 862 |
9413 | 863 :: "r" (&c->redDither), |
864 "m" (dummy), "m" (dummy), "m" (dummy), | |
865 "r" (dest), "m" (dstW) | |
866 : "%eax", "%edx", "%esi" | |
3344 | 867 ); |
868 } | |
6578 | 869 break; |
870 case IMGFMT_BGR24: | |
3344 | 871 { |
872 asm volatile( | |
873 YSCALEYUV2RGBX | |
874 "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize | |
875 "addl %4, %%ebx \n\t" | |
9414 | 876 WRITEBGR24(%%ebx, %5, %%eax) |
3344 | 877 |
9413 | 878 :: "r" (&c->redDither), |
879 "m" (dummy), "m" (dummy), "m" (dummy), | |
880 "r" (dest), "m" (dstW) | |
881 : "%eax", "%ebx", "%edx", "%esi" //FIXME ebx | |
3344 | 882 ); |
883 } | |
6578 | 884 break; |
885 case IMGFMT_BGR15: | |
3344 | 886 { |
887 asm volatile( | |
888 YSCALEYUV2RGBX | |
889 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
890 #ifdef DITHER1XBPP | |
4248 | 891 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
892 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
893 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
3344 | 894 #endif |
895 | |
9414 | 896 WRITEBGR15(%4, %5, %%eax) |
3344 | 897 |
9413 | 898 :: "r" (&c->redDither), |
899 "m" (dummy), "m" (dummy), "m" (dummy), | |
900 "r" (dest), "m" (dstW) | |
901 : "%eax", "%edx", "%esi" | |
3344 | 902 ); |
903 } | |
6578 | 904 break; |
905 case IMGFMT_BGR16: | |
3344 | 906 { |
907 asm volatile( | |
908 YSCALEYUV2RGBX | |
909 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
910 #ifdef DITHER1XBPP | |
4248 | 911 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
912 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
913 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
3344 | 914 #endif |
915 | |
9414 | 916 WRITEBGR16(%4, %5, %%eax) |
3344 | 917 |
9413 | 918 :: "r" (&c->redDither), |
919 "m" (dummy), "m" (dummy), "m" (dummy), | |
920 "r" (dest), "m" (dstW) | |
921 : "%eax", "%edx", "%esi" | |
3344 | 922 ); |
923 } | |
6578 | 924 break; |
7723 | 925 case IMGFMT_YUY2: |
926 { | |
927 asm volatile( | |
928 YSCALEYUV2PACKEDX | |
929 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
930 | |
931 "psraw $3, %%mm3 \n\t" | |
932 "psraw $3, %%mm4 \n\t" | |
933 "psraw $3, %%mm1 \n\t" | |
934 "psraw $3, %%mm7 \n\t" | |
9414 | 935 WRITEYUY2(%4, %5, %%eax) |
7723 | 936 |
9413 | 937 :: "r" (&c->redDither), |
938 "m" (dummy), "m" (dummy), "m" (dummy), | |
939 "r" (dest), "m" (dstW) | |
940 : "%eax", "%edx", "%esi" | |
7723 | 941 ); |
942 } | |
943 break; | |
3344 | 944 #endif |
6578 | 945 default: |
12698 | 946 #ifdef HAVE_ALTIVEC |
947 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize, | |
948 chrFilter, chrSrc, chrFilterSize, | |
949 dest, dstW, dstY); | |
950 #else | |
7723 | 951 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, |
6578 | 952 chrFilter, chrSrc, chrFilterSize, |
953 dest, dstW, dstY); | |
12698 | 954 #endif |
6578 | 955 break; |
956 } | |
3344 | 957 } |
958 | |
959 /** | |
960 * vertical bilinear scale YV12 to RGB | |
961 */ | |
7723 | 962 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, |
6578 | 963 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
964 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
965 int yalpha1=yalpha^4095; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
966 int uvalpha1=uvalpha^4095; |
6578 | 967 int i; |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
968 |
11000 | 969 #if 0 //isn't used |
4467 | 970 if(flags&SWS_FULL_CHR_H_INT) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
971 { |
6578 | 972 switch(dstFormat) |
973 { | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
974 #ifdef HAVE_MMX |
6578 | 975 case IMGFMT_BGR32: |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
976 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
977 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
978 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
979 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
980 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
981 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
982 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
983 "movq %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
984 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
985 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
986 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
987 MOVNTQ(%%mm3, (%4, %%eax, 4)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
988 MOVNTQ(%%mm1, 8(%4, %%eax, 4)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
989 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
990 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
991 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
992 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
993 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
994 |
3209 | 995 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
996 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
997 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
998 ); |
6578 | 999 break; |
1000 case IMGFMT_BGR24: | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1001 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1002 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1003 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1004 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1005 // lsb ... msb |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1006 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1007 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1008 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1009 "movq %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1010 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1011 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1012 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1013 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1014 "psrlq $8, %%mm3 \n\t" // GR0BGR00 |
4248 | 1015 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000 |
1016 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00 | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1017 "por %%mm2, %%mm3 \n\t" // BGRBGR00 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1018 "movq %%mm1, %%mm2 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1019 "psllq $48, %%mm1 \n\t" // 000000BG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1020 "por %%mm1, %%mm3 \n\t" // BGRBGRBG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1021 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1022 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1023 "psrld $16, %%mm2 \n\t" // R000R000 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1024 "psrlq $24, %%mm1 \n\t" // 0BGR0000 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1025 "por %%mm2, %%mm1 \n\t" // RBGRR000 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1026 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1027 "movl %4, %%ebx \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1028 "addl %%eax, %%ebx \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1029 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1030 #ifdef HAVE_MMX2 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1031 //FIXME Alignment |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1032 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1033 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1034 #else |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1035 "movd %%mm3, (%%ebx, %%eax, 2) \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1036 "psrlq $32, %%mm3 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1037 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1038 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1039 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1040 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1041 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1042 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1043 |
3209 | 1044 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1045 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1046 : "%eax", "%ebx" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1047 ); |
6578 | 1048 break; |
1049 case IMGFMT_BGR15: | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1050 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1051 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1052 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1053 #ifdef DITHER1XBPP |
4248 | 1054 "paddusb "MANGLE(g5Dither)", %%mm1\n\t" |
1055 "paddusb "MANGLE(r5Dither)", %%mm0\n\t" | |
1056 "paddusb "MANGLE(b5Dither)", %%mm3\n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1057 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1058 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1059 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1060 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1061 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1062 "psrlw $3, %%mm3 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1063 "psllw $2, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1064 "psllw $7, %%mm0 \n\t" |
4248 | 1065 "pand "MANGLE(g15Mask)", %%mm1 \n\t" |
1066 "pand "MANGLE(r15Mask)", %%mm0 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1067 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1068 "por %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1069 "por %%mm1, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1070 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1071 MOVNTQ(%%mm0, (%4, %%eax, 2)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1072 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1073 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1074 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1075 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1076 |
3209 | 1077 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1078 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1079 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1080 ); |
6578 | 1081 break; |
1082 case IMGFMT_BGR16: | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1083 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1084 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1085 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1086 #ifdef DITHER1XBPP |
4248 | 1087 "paddusb "MANGLE(g6Dither)", %%mm1\n\t" |
1088 "paddusb "MANGLE(r5Dither)", %%mm0\n\t" | |
1089 "paddusb "MANGLE(b5Dither)", %%mm3\n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1090 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1091 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1092 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1093 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1094 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1095 "psrlw $3, %%mm3 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1096 "psllw $3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1097 "psllw $8, %%mm0 \n\t" |
4248 | 1098 "pand "MANGLE(g16Mask)", %%mm1 \n\t" |
1099 "pand "MANGLE(r16Mask)", %%mm0 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1100 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1101 "por %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1102 "por %%mm1, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1103 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1104 MOVNTQ(%%mm0, (%4, %%eax, 2)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1105 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1106 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1107 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1108 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1109 |
3209 | 1110 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1111 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1112 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1113 ); |
6578 | 1114 break; |
1115 #endif | |
1116 case IMGFMT_RGB32: | |
1117 #ifndef HAVE_MMX | |
1118 case IMGFMT_BGR32: | |
1119 #endif | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1120 if(dstFormat==IMGFMT_BGR32) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1121 { |
4794 | 1122 int i; |
4793 | 1123 #ifdef WORDS_BIGENDIAN |
1124 dest++; | |
1125 #endif | |
3209 | 1126 for(i=0;i<dstW;i++){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1127 // vertical linear interpolation && yuv2rgb in a single step: |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1128 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1129 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1130 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
2503 | 1131 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; |
1132 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1133 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1134 dest+= 4; |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1135 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1136 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1137 else if(dstFormat==IMGFMT_BGR24) |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1138 { |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1139 int i; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1140 for(i=0;i<dstW;i++){ |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1141 // vertical linear interpolation && yuv2rgb in a single step: |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1142 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1143 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1144 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1145 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1146 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1147 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1148 dest+= 3; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1149 } |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1150 } |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1151 else if(dstFormat==IMGFMT_BGR16) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1152 { |
2671 | 1153 int i; |
3209 | 1154 for(i=0;i<dstW;i++){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1155 // vertical linear interpolation && yuv2rgb in a single step: |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1156 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1157 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1158 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1159 |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1160 ((uint16_t*)dest)[i] = |
2584 | 1161 clip_table16b[(Y + yuvtab_40cf[U]) >>13] | |
1162 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1163 clip_table16r[(Y + yuvtab_3343[V]) >>13]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1164 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1165 } |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
1166 else if(dstFormat==IMGFMT_BGR15) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1167 { |
2671 | 1168 int i; |
3209 | 1169 for(i=0;i<dstW;i++){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1170 // vertical linear interpolation && yuv2rgb in a single step: |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1171 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1172 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1173 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1174 |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1175 ((uint16_t*)dest)[i] = |
2584 | 1176 clip_table15b[(Y + yuvtab_40cf[U]) >>13] | |
1177 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1178 clip_table15r[(Y + yuvtab_3343[V]) >>13]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1179 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1180 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1181 }//FULL_UV_IPOL |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1182 else |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1183 { |
6578 | 1184 #endif // if 0 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1185 #ifdef HAVE_MMX |
6578 | 1186 switch(c->dstFormat) |
1187 { | |
11000 | 1188 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( |
6578 | 1189 case IMGFMT_BGR32: |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1190 asm volatile( |
9414 | 1191 "movl %%esp, "ESP_OFFSET"(%5) \n\t" |
1192 "movl %4, %%esp \n\t" | |
1193 YSCALEYUV2RGB(%%eax, %5) | |
1194 WRITEBGR32(%%esp, 8280(%5), %%eax) | |
1195 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1196 |
9414 | 1197 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), |
1198 "r" (&c->redDither) | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1199 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1200 ); |
6578 | 1201 return; |
1202 case IMGFMT_BGR24: | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1203 asm volatile( |
9414 | 1204 "movl %%esp, "ESP_OFFSET"(%5) \n\t" |
1205 "movl %4, %%esp \n\t" | |
1206 YSCALEYUV2RGB(%%eax, %5) | |
1207 WRITEBGR24(%%esp, 8280(%5), %%eax) | |
1208 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
1209 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1210 "r" (&c->redDither) | |
1211 : "%eax" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1212 ); |
6578 | 1213 return; |
1214 case IMGFMT_BGR15: | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1215 asm volatile( |
9414 | 1216 "movl %%esp, "ESP_OFFSET"(%5) \n\t" |
1217 "movl %4, %%esp \n\t" | |
1218 YSCALEYUV2RGB(%%eax, %5) | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1219 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1220 #ifdef DITHER1XBPP |
4248 | 1221 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1222 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1223 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1224 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1225 |
9414 | 1226 WRITEBGR15(%%esp, 8280(%5), %%eax) |
1227 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1228 |
9414 | 1229 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), |
1230 "r" (&c->redDither) | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1231 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1232 ); |
6578 | 1233 return; |
1234 case IMGFMT_BGR16: | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1235 asm volatile( |
9414 | 1236 "movl %%esp, "ESP_OFFSET"(%5) \n\t" |
1237 "movl %4, %%esp \n\t" | |
1238 YSCALEYUV2RGB(%%eax, %5) | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1239 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1240 #ifdef DITHER1XBPP |
4248 | 1241 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1242 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1243 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1244 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1245 |
9414 | 1246 WRITEBGR16(%%esp, 8280(%5), %%eax) |
1247 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
1248 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1249 "r" (&c->redDither) | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1250 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1251 ); |
6578 | 1252 return; |
7723 | 1253 case IMGFMT_YUY2: |
1254 asm volatile( | |
9414 | 1255 "movl %%esp, "ESP_OFFSET"(%5) \n\t" |
1256 "movl %4, %%esp \n\t" | |
1257 YSCALEYUV2PACKED(%%eax, %5) | |
1258 WRITEYUY2(%%esp, 8280(%5), %%eax) | |
1259 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
1260 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1261 "r" (&c->redDither) | |
7723 | 1262 : "%eax" |
1263 ); | |
1264 return; | |
6578 | 1265 default: break; |
1266 } | |
1267 #endif //HAVE_MMX | |
7723 | 1268 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1269 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1270 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1271 /** |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1272 * YV12 to RGB without scaling or interpolating |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1273 */ |
/*
 * yuv2packed1: writes one packed output line from a single luma line (buf0)
 * and two chroma lines (uvbuf0, uvbuf1).
 *
 * - If flags has SWS_FULL_CHR_H_INT set, the work is delegated to yuv2packed2
 *   with buf0 passed for both luma lines and 0 in what is presumably the
 *   yalpha slot (yuv2packed2's signature is outside this view -- confirm).
 * - With HAVE_MMX, BGR32/BGR24/BGR15/BGR16/YUY2 are handled by inline asm and
 *   return directly; uvalpha < 2048 selects the "...1" macros, otherwise the
 *   "...1b" macros. Any other dstFormat falls out of the switch (there is no
 *   default case) into the C macros at the end of the function.
 * - The locals yalpha1, yalpha and i are not referenced directly in the
 *   visible code; presumably the YSCALE_YUV_2_* macros use them -- confirm.
 */
7723 | 1274 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
6578 | 1275 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1276 { |
3344 | 1277 const int yalpha1=0;
6578 | 1278 int i;
1279 | |
1280 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1 | |
1281 const int yalpha= 4096; //FIXME ... | |
2671 | 1282
/* Full chroma interpolation requested: reuse the two-line routine. */
4467 | 1283 if(flags&SWS_FULL_CHR_H_INT)
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1284 { |
7723 | 1285 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1286 return; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1287 } |
2576 | 1288
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
/*
 * Every asm block below follows the same pattern: save %esp into the context
 * at ESP_OFFSET(%5), load dest (operand 4) into %esp so the WRITE* macros can
 * use it as the destination pointer, then restore %esp from the context.
 * Operand 5 is &c->redDither; 8280(%5) is the DSTW_OFFSET slot according to
 * the note in yuv2packed2.
 */
1289 #ifdef HAVE_MMX |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1290 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1291 { |
6578 | 1292 switch(dstFormat)
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1293 { |
6578 | 1294 case IMGFMT_BGR32:
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1295 asm volatile( |
9417 | 1296 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1297 "movl %4, %%esp \n\t" | |
1298 YSCALEYUV2RGB1(%%eax, %5) | |
1299 WRITEBGR32(%%esp, 8280(%5), %%eax) | |
1300 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
1301 | |
1302 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1303 "r" (&c->redDither) | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1304 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1305 ); |
6578 | 1306 return;
1307 case IMGFMT_BGR24: | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1308 asm volatile( |
9417 | 1309 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1310 "movl %4, %%esp \n\t" | |
1311 YSCALEYUV2RGB1(%%eax, %5) | |
1312 WRITEBGR24(%%esp, 8280(%5), %%eax) | |
1313 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
1314 | |
1315 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1316 "r" (&c->redDither) | |
1317 : "%eax" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1318 ); |
6578 | 1319 return;
1320 case IMGFMT_BGR15: | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1321 asm volatile( |
9417 | 1322 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1323 "movl %4, %%esp \n\t" | |
1324 YSCALEYUV2RGB1(%%eax, %5) | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1325 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1326 #ifdef DITHER1XBPP |
4248 | 1327 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1328 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1329 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1330 #endif |
9417 | 1331 WRITEBGR15(%%esp, 8280(%5), %%eax)
1332 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
1333 | |
1334 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1335 "r" (&c->redDither) | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1336 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1337 ); |
6578 | 1338 return;
1339 case IMGFMT_BGR16: | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1340 asm volatile( |
9417 | 1341 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1342 "movl %4, %%esp \n\t" | |
1343 YSCALEYUV2RGB1(%%eax, %5) | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1344 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1345 #ifdef DITHER1XBPP |
4248 | 1346 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1347 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1348 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1349 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1350 |
9417 | 1351 WRITEBGR16(%%esp, 8280(%5), %%eax)
1352 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
1353 | |
1354 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1355 "r" (&c->redDither) | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1356 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1357 ); |
6578 | 1358 return;
7723 | 1359 case IMGFMT_YUY2:
1360 asm volatile( | |
9417 | 1361 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1362 "movl %4, %%esp \n\t" | |
1363 YSCALEYUV2PACKED1(%%eax, %5) | |
1364 WRITEYUY2(%%esp, 8280(%5), %%eax) | |
1365 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
1366 | |
1367 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1368 "r" (&c->redDither) | |
7723 | 1369 : "%eax"
1370 ); | |
1371 return; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1372 } |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1373 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
/* uvalpha >= 2048: same format dispatch, but using the "...1b" macro variants. */
1374 else |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1375 { |
6578 | 1376 switch(dstFormat)
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1377 { |
6578 | 1378 case IMGFMT_BGR32:
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1379 asm volatile( |
9417 | 1380 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1381 "movl %4, %%esp \n\t" | |
1382 YSCALEYUV2RGB1b(%%eax, %5) | |
1383 WRITEBGR32(%%esp, 8280(%5), %%eax) | |
1384 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
1385 | |
1386 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1387 "r" (&c->redDither) | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1388 : "%eax" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1389 ); |
6578 | 1390 return;
1391 case IMGFMT_BGR24: | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1392 asm volatile( |
9417 | 1393 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1394 "movl %4, %%esp \n\t" | |
1395 YSCALEYUV2RGB1b(%%eax, %5) | |
1396 WRITEBGR24(%%esp, 8280(%5), %%eax) | |
1397 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
1398 | |
1399 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1400 "r" (&c->redDither) | |
1401 : "%eax" | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1402 ); |
6578 | 1403 return;
1404 case IMGFMT_BGR15: | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1405 asm volatile( |
9417 | 1406 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1407 "movl %4, %%esp \n\t" | |
1408 YSCALEYUV2RGB1b(%%eax, %5) | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1409 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1410 #ifdef DITHER1XBPP |
4248 | 1411 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1412 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1413 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1414 #endif |
9417 | 1415 WRITEBGR15(%%esp, 8280(%5), %%eax)
1416 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
1417 | |
1418 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1419 "r" (&c->redDither) | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1420 : "%eax" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1421 ); |
6578 | 1422 return;
1423 case IMGFMT_BGR16: | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1424 asm volatile( |
9417 | 1425 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1426 "movl %4, %%esp \n\t" | |
1427 YSCALEYUV2RGB1b(%%eax, %5) | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1428 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1429 #ifdef DITHER1XBPP |
4248 | 1430 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1431 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1432 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1433 #endif |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1434 |
9417 | 1435 WRITEBGR16(%%esp, 8280(%5), %%eax)
1436 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
1437 | |
1438 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1439 "r" (&c->redDither) | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1440 : "%eax" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1441 ); |
6578 | 1442 return;
7723 | 1443 case IMGFMT_YUY2:
1444 asm volatile( | |
9417 | 1445 "movl %%esp, "ESP_OFFSET"(%5) \n\t"
1446 "movl %4, %%esp \n\t" | |
1447 YSCALEYUV2PACKED1b(%%eax, %5) | |
1448 WRITEYUY2(%%esp, 8280(%5), %%eax) | |
1449 "movl "ESP_OFFSET"(%5), %%esp \n\t" | |
1450 | |
1451 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1452 "r" (&c->redDither) | |
7723 | 1453 : "%eax"
1454 ); | |
1455 return; | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1456 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1457 } |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1458 #endif |
/*
 * C fallback: reached when HAVE_MMX is not defined, or when dstFormat matched
 * none of the asm cases above. Uses the same uvalpha < 2048 split as the asm.
 */
6578 | 1459 if( uvalpha < 2048 )
1460 { | |
7723 | 1461 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
6578 | 1462 }else{
7723 | 1463 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
6578 | 1464 }
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1465 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1466 |
4481 | 1467 //FIXME yuy2* can read up to 7 samples too much |
1468 | |
/*
 * yuy2ToY: copies every even-indexed byte of src into dst, producing `width`
 * output bytes (C path: dst[i] = src[2*i]).
 *
 * dst   - output line, `width` bytes are produced
 * src   - packed input line, 2 source bytes are consumed per output byte
 * width - number of output samples
 */
4467 | 1469 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1470 { | |
4481 | 1471 #ifdef HAVE_MMX
/*
 * MMX path: eax starts at -width and counts up by 8, while the pointer
 * operands are pre-advanced to the end of each line (src+width*2, dst+width),
 * so (base, eax) addressing walks from the start of the line. Each iteration
 * loads 16 source bytes, masks them with bm01010101 (defined outside this
 * view; presumably keeps the low byte of every 16-bit word -- confirm), packs
 * them to 8 bytes and stores them. The loop runs until eax is no longer
 * negative, so a width that is not a multiple of 8 is effectively rounded up
 * and bytes beyond `width` are read and written.
 */
1472 asm volatile( | |
1473 "movq "MANGLE(bm01010101)", %%mm2\n\t" | |
1474 "movl %0, %%eax \n\t" | |
1475 "1: \n\t" | |
1476 "movq (%1, %%eax,2), %%mm0 \n\t" | |
1477 "movq 8(%1, %%eax,2), %%mm1 \n\t" | |
1478 "pand %%mm2, %%mm0 \n\t" | |
1479 "pand %%mm2, %%mm1 \n\t" | |
1480 "packuswb %%mm1, %%mm0 \n\t" | |
1481 "movq %%mm0, (%2, %%eax) \n\t" | |
1482 "addl $8, %%eax \n\t" | |
1483 " js 1b \n\t" | |
1484 : : "g" (-width), "r" (src+width*2), "r" (dst+width) | |
1485 : "%eax" | |
1486 ); | |
4467 | 1487 #else
/* Portable path: one output byte per iteration. */
1488 int i; | |
1489 for(i=0; i<width; i++) | |
1490 dst[i]= src[2*i]; | |
1491 #endif | |
1492 } | |
1493 | |
/*
 * yuy2ToUV: produces `width` U and `width` V samples by averaging two packed
 * input lines. C path: U comes from byte 4*i+1 and V from byte 4*i+3 of each
 * line, combined as (a + b) >> 1.
 *
 * dstU, dstV - output lines, `width` bytes each
 * src1, src2 - the two packed input lines, 4 source bytes per output sample
 * width      - number of output samples per plane
 */
1494 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1495 { | |
4481 | 1496 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/*
 * SIMD path: 4 output samples per iteration (eax runs from -width upward in
 * steps of 4, pointers are pre-advanced to the line ends). The two lines are
 * averaged with PAVGB, "psrlw $8" keeps the odd bytes of every word, and the
 * packed result is split again: the masked copy (mm1) is stored through
 * operand 3 (dstU), the shifted copy (mm0) through operand 4 (dstV).
 * NOTE(review): PAVGB is defined outside this view; if it rounds up, results
 * can differ by one from the truncating C path -- confirm.
 */
1497 asm volatile( | |
1498 "movq "MANGLE(bm01010101)", %%mm4\n\t" | |
1499 "movl %0, %%eax \n\t" | |
1500 "1: \n\t" | |
1501 "movq (%1, %%eax,4), %%mm0 \n\t" | |
1502 "movq 8(%1, %%eax,4), %%mm1 \n\t" | |
1503 "movq (%2, %%eax,4), %%mm2 \n\t" | |
1504 "movq 8(%2, %%eax,4), %%mm3 \n\t" | |
1505 PAVGB(%%mm2, %%mm0) | |
1506 PAVGB(%%mm3, %%mm1) | |
1507 "psrlw $8, %%mm0 \n\t" | |
1508 "psrlw $8, %%mm1 \n\t" | |
1509 "packuswb %%mm1, %%mm0 \n\t" | |
1510 "movq %%mm0, %%mm1 \n\t" | |
1511 "psrlw $8, %%mm0 \n\t" | |
1512 "pand %%mm4, %%mm1 \n\t" | |
1513 "packuswb %%mm0, %%mm0 \n\t" | |
1514 "packuswb %%mm1, %%mm1 \n\t" | |
1515 "movd %%mm0, (%4, %%eax) \n\t" | |
1516 "movd %%mm1, (%3, %%eax) \n\t" | |
1517 "addl $4, %%eax \n\t" | |
1518 " js 1b \n\t" | |
1519 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width) | |
1520 : "%eax" | |
1521 ); | |
4467 | 1522 #else
/* Portable path: truncating average of the two lines, one sample at a time. */
1523 int i; | |
1524 for(i=0; i<width; i++) | |
1525 { | |
1526 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1; | |
1527 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1; | |
1528 } | |
1529 #endif | |
1530 } | |
1531 | |
9071 | 1532 //this is almost identical to the previous, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
/*
 * uyvyToY: copies every odd-indexed byte of src into dst, producing `width`
 * output bytes (C path: dst[i] = src[2*i+1]).
 *
 * dst   - output line, `width` bytes are produced
 * src   - packed input line, 2 source bytes are consumed per output byte
 * width - number of output samples
 */
1533 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width) | |
1534 { | |
1535 #ifdef HAVE_MMX | |
/*
 * MMX path: 8 output bytes per iteration. eax runs from -width upward in
 * steps of 8 against pointers pre-advanced to the line ends. "psrlw $8"
 * moves the high (odd) byte of every 16-bit word into the low byte before
 * packing. The loop only stops once eax is non-negative, so a width that is
 * not a multiple of 8 touches bytes beyond `width`.
 */
1536 asm volatile( | |
1537 "movl %0, %%eax \n\t" | |
1538 "1: \n\t" | |
1539 "movq (%1, %%eax,2), %%mm0 \n\t" | |
1540 "movq 8(%1, %%eax,2), %%mm1 \n\t" | |
1541 "psrlw $8, %%mm0 \n\t" | |
1542 "psrlw $8, %%mm1 \n\t" | |
1543 "packuswb %%mm1, %%mm0 \n\t" | |
1544 "movq %%mm0, (%2, %%eax) \n\t" | |
1545 "addl $8, %%eax \n\t" | |
1546 " js 1b \n\t" | |
1547 : : "g" (-width), "r" (src+width*2), "r" (dst+width) | |
1548 : "%eax" | |
1549 ); | |
1550 #else | |
/* Portable path: one output byte per iteration. */
1551 int i; | |
1552 for(i=0; i<width; i++) | |
1553 dst[i]= src[2*i+1]; | |
1554 #endif | |
1555 } | |
1556 | |
/*
 * uyvyToUV: produces `width` U and `width` V samples by averaging two packed
 * input lines. C path: U comes from byte 4*i+0 and V from byte 4*i+2 of each
 * line, combined as (a + b) >> 1.
 *
 * dstU, dstV - output lines, `width` bytes each
 * src1, src2 - the two packed input lines, 4 source bytes per output sample
 * width      - number of output samples per plane
 */
1557 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1558 { | |
1559 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
/*
 * SIMD path: 4 output samples per iteration. Same structure as the yuy2
 * variant, except that the chroma bytes are selected with "pand" against
 * bm01010101 (defined outside this view; presumably a low-byte-per-word
 * mask -- confirm) instead of a right shift. The masked copy (mm1) is stored
 * through operand 3 (dstU), the shifted copy (mm0) through operand 4 (dstV).
 * NOTE(review): PAVGB is defined outside this view; if it rounds up, results
 * can differ by one from the truncating C path -- confirm.
 */
1560 asm volatile( | |
1561 "movq "MANGLE(bm01010101)", %%mm4\n\t" | |
1562 "movl %0, %%eax \n\t" | |
1563 "1: \n\t" | |
1564 "movq (%1, %%eax,4), %%mm0 \n\t" | |
1565 "movq 8(%1, %%eax,4), %%mm1 \n\t" | |
1566 "movq (%2, %%eax,4), %%mm2 \n\t" | |
1567 "movq 8(%2, %%eax,4), %%mm3 \n\t" | |
1568 PAVGB(%%mm2, %%mm0) | |
1569 PAVGB(%%mm3, %%mm1) | |
1570 "pand %%mm4, %%mm0 \n\t" | |
1571 "pand %%mm4, %%mm1 \n\t" | |
1572 "packuswb %%mm1, %%mm0 \n\t" | |
1573 "movq %%mm0, %%mm1 \n\t" | |
1574 "psrlw $8, %%mm0 \n\t" | |
1575 "pand %%mm4, %%mm1 \n\t" | |
1576 "packuswb %%mm0, %%mm0 \n\t" | |
1577 "packuswb %%mm1, %%mm1 \n\t" | |
1578 "movd %%mm0, (%4, %%eax) \n\t" | |
1579 "movd %%mm1, (%3, %%eax) \n\t" | |
1580 "addl $4, %%eax \n\t" | |
1581 " js 1b \n\t" | |
1582 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width) | |
1583 : "%eax" | |
1584 ); | |
1585 #else | |
/* Portable path: truncating average of the two lines, one sample at a time. */
1586 int i; | |
1587 for(i=0; i<width; i++) | |
1588 { | |
1589 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1; | |
1590 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1; | |
1591 } | |
1592 #endif | |
1593 } | |
1594 | |
/*
 * bgr32ToY: converts `width` 32-bit pixels to one luma byte each.
 *
 * src is read as an array of uint32_t; b is taken from bits 0-7, g from bits
 * 8-15 and r from bits 16-23 of each word (bits 24-31 are ignored).
 * RY/GY/BY and RGB2YUV_SHIFT are defined outside this view.
 */
4467 | 1595 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1596 { | |
/* No SIMD version exists: if HAVE_MMXFIXME were defined the body would be empty. */
1597 #ifdef HAVE_MMXFIXME | |
1598 #else | |
1599 int i; | |
1600 for(i=0; i<width; i++) | |
1601 { | |
9433 | 1602 int b= ((uint32_t*)src)[i]&0xFF;
1603 int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
9499 | 1604 int r= (((uint32_t*)src)[i]>>16)&0xFF;
4467 | 1605
/* 33<<(RGB2YUV_SHIFT-1) is 16.5 in fixed point: a +16 offset plus 0.5 for rounding. */
9433 | 1606 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
4467 | 1607 }
1608 #endif | |
1609 } | |
1610 | |
/*
 * bgr32ToUV: produces `width` U and V samples, each computed from a 2x2
 * group of 32-bit pixels (two horizontally adjacent pixels from src1 and the
 * two at the same position in src2).
 *
 * RU/GU/BU, RV/GV/BV and RGB2YUV_SHIFT are defined outside this view.
 */
1611 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1612 { | |
/* No SIMD version exists: if HAVE_MMXFIXME were defined the body would be empty. */
1613 #ifdef HAVE_MMXFIXME | |
1614 #else | |
1615 int i; | |
1616 for(i=0; i<width; i++) | |
1617 { | |
9433 | 1618 const int a= ((uint32_t*)src1)[2*i+0];
1619 const int e= ((uint32_t*)src1)[2*i+1]; | |
1620 const int c= ((uint32_t*)src2)[2*i+0]; | |
1621 const int d= ((uint32_t*)src2)[2*i+1]; | |
/*
 * Sum the four pixels channel-wise without unpacking each one: `l` carries
 * the sum of bits 0-7 in its low bits and the sum of bits 16-23 from bit 16
 * upward; `h` carries the sum of bits 8-15, still shifted left by 8.
 */
1622 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF); | |
1623 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00); | |
/* A sum of four 8-bit values is at most 1020, so 10 bits (0x3FF) hold it. */
1624 const int b= l&0x3FF; | |
1625 const int g= h>>8; | |
1626 const int r= l>>16; | |
4467 | 1627
/* The extra +2 in the shift divides the four-pixel sums by 4 (averaging). */
1628 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1629 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1630 } | |
1631 #endif | |
1632 } | |
1633 | |
/*
 * bgr24ToY: converts `width` 3-byte pixels to one luma byte each.
 *
 * C path: b, g, r are bytes 0, 1, 2 of each pixel and
 * dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT.
 * RY/GY/BY, RGB2YUV_SHIFT and the MMX constants (bgr2YCoeff, w1111,
 * bgr2YOffset) are defined outside this view.
 */
1634 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width) | |
1635 { | |
4612 | 1636 #ifdef HAVE_MMX
/*
 * MMX path: 8 pixels (24 source bytes) per iteration. eax = -width indexes
 * the output and ebx = 3*eax indexes the input, both relative to pointers
 * pre-advanced to the line ends. Each pixel is loaded with a 4-byte movd,
 * widened to words, multiplied by bgr2YCoeff with pmaddwd, and the partial
 * products are summed via pmaddwd with w1111. The two groups of four results
 * are packed to bytes, bgr2YOffset is added with saturation, and 8 bytes are
 * stored. The loop runs until eax is non-negative, so a width that is not a
 * multiple of 8 touches bytes beyond `width`; the last movd of a group
 * (offset 21) also reads one byte past the 24 bytes of the 8 pixels.
 */
1637 asm volatile( | |
1638 "movl %2, %%eax \n\t" | |
4923 | 1639 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1640 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
4612 | 1641 "pxor %%mm7, %%mm7 \n\t"
1642 "leal (%%eax, %%eax, 2), %%ebx \n\t" | |
1643 ".balign 16 \n\t" | |
1644 "1: \n\t" | |
1645 PREFETCH" 64(%0, %%ebx) \n\t" | |
1646 "movd (%0, %%ebx), %%mm0 \n\t" | |
1647 "movd 3(%0, %%ebx), %%mm1 \n\t" | |
1648 "punpcklbw %%mm7, %%mm0 \n\t" | |
1649 "punpcklbw %%mm7, %%mm1 \n\t" | |
1650 "movd 6(%0, %%ebx), %%mm2 \n\t" | |
1651 "movd 9(%0, %%ebx), %%mm3 \n\t" | |
1652 "punpcklbw %%mm7, %%mm2 \n\t" | |
1653 "punpcklbw %%mm7, %%mm3 \n\t" | |
1654 "pmaddwd %%mm6, %%mm0 \n\t" | |
1655 "pmaddwd %%mm6, %%mm1 \n\t" | |
1656 "pmaddwd %%mm6, %%mm2 \n\t" | |
1657 "pmaddwd %%mm6, %%mm3 \n\t" | |
1658 #ifndef FAST_BGR2YV12 | |
1659 "psrad $8, %%mm0 \n\t" | |
1660 "psrad $8, %%mm1 \n\t" | |
1661 "psrad $8, %%mm2 \n\t" | |
1662 "psrad $8, %%mm3 \n\t" | |
1663 #endif | |
1664 "packssdw %%mm1, %%mm0 \n\t" | |
1665 "packssdw %%mm3, %%mm2 \n\t" | |
1666 "pmaddwd %%mm5, %%mm0 \n\t" | |
1667 "pmaddwd %%mm5, %%mm2 \n\t" | |
1668 "packssdw %%mm2, %%mm0 \n\t" | |
1669 "psraw $7, %%mm0 \n\t" | |
1670 | |
1671 "movd 12(%0, %%ebx), %%mm4 \n\t" | |
1672 "movd 15(%0, %%ebx), %%mm1 \n\t" | |
1673 "punpcklbw %%mm7, %%mm4 \n\t" | |
1674 "punpcklbw %%mm7, %%mm1 \n\t" | |
1675 "movd 18(%0, %%ebx), %%mm2 \n\t" | |
1676 "movd 21(%0, %%ebx), %%mm3 \n\t" | |
1677 "punpcklbw %%mm7, %%mm2 \n\t" | |
1678 "punpcklbw %%mm7, %%mm3 \n\t" | |
1679 "pmaddwd %%mm6, %%mm4 \n\t" | |
1680 "pmaddwd %%mm6, %%mm1 \n\t" | |
1681 "pmaddwd %%mm6, %%mm2 \n\t" | |
1682 "pmaddwd %%mm6, %%mm3 \n\t" | |
1683 #ifndef FAST_BGR2YV12 | |
1684 "psrad $8, %%mm4 \n\t" | |
1685 "psrad $8, %%mm1 \n\t" | |
1686 "psrad $8, %%mm2 \n\t" | |
1687 "psrad $8, %%mm3 \n\t" | |
1688 #endif | |
1689 "packssdw %%mm1, %%mm4 \n\t" | |
1690 "packssdw %%mm3, %%mm2 \n\t" | |
1691 "pmaddwd %%mm5, %%mm4 \n\t" | |
1692 "pmaddwd %%mm5, %%mm2 \n\t" | |
1693 "addl $24, %%ebx \n\t" | |
1694 "packssdw %%mm2, %%mm4 \n\t" | |
1695 "psraw $7, %%mm4 \n\t" | |
1696 | |
1697 "packuswb %%mm4, %%mm0 \n\t" | |
4923 | 1698 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
4612 | 1699
4619 | 1700 "movq %%mm0, (%1, %%eax) \n\t"
4612 | 1701 "addl $8, %%eax \n\t"
1702 " js 1b \n\t" | |
1703 : : "r" (src+width*3), "r" (dst+width), "g" (-width) | |
1704 : "%eax", "%ebx" | |
1705 ); | |
4467 | 1706 #else
/* Portable path: one pixel per iteration. */
1707 int i; | |
1708 for(i=0; i<width; i++) | |
1709 { | |
1710 int b= src[i*3+0]; | |
1711 int g= src[i*3+1]; | |
1712 int r= src[i*3+2]; | |
1713 | |
/* 33<<(RGB2YUV_SHIFT-1) is 16.5 in fixed point: a +16 offset plus 0.5 for rounding. */
9434 | 1714 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
4467 | 1715 }
1716 #endif | |
1717 } | |
1718 | |
/*
 * bgr24ToUV: compute one line of U and V samples from two lines of packed
 * 24 bit pixels (byte 0 = b, byte 1 = g, byte 2 = r, as named in the C path).
 *
 * dstU, dstV : receive 'width' chroma samples each
 * src1, src2 : the two source lines; 6 bytes (two adjacent pixels) of each line
 *              are consumed per output sample, so 4 pixels contribute to
 *              every U/V value
 * width      : number of output chroma samples
 *
 * C path: the four b, g and r values are summed, weighted with the RU/GU/BU
 * resp. RV/GV/BV coefficients, shifted by RGB2YUV_SHIFT+2 (the extra 2 divides
 * by the 4 summed pixels) and offset by 128.
 *
 * MMX path: %eax starts at -width and is advanced by 4 per iteration until it
 * is no longer negative, %ebx starts at -6*width and is advanced by 24; the
 * pointer operands are passed pre-advanced by width*6 (sources) and width
 * (destinations). Each iteration therefore stores 4 U and 4 V bytes.
 * NOTE(review): since stores happen in groups of 4 (and the MMX2/3DNow loads
 * are 8 bytes wide) this path can touch memory past 'width' samples when width
 * is not a multiple of 4 -- presumably callers provide padded buffers; confirm.
 * NOTE(review): w1111, bgr2UCoeff, bgr2VCoeff and bgr2UVOffset are defined
 * outside this chunk.
 */
1719 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1720 { | |
4619 | 1721 #ifdef HAVE_MMX |
1722 asm volatile( | |
1723 "movl %4, %%eax \n\t" | |
4923 | 1724 "movq "MANGLE(w1111)", %%mm5 \n\t" |
1725 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | |
4619 | 1726 "pxor %%mm7, %%mm7 \n\t" |
1727 "leal (%%eax, %%eax, 2), %%ebx \n\t" | |
1728 "addl %%ebx, %%ebx \n\t" | |
1729 ".balign 16 \n\t" | |
1730 "1: \n\t" | |
1731 PREFETCH" 64(%0, %%ebx) \n\t" | |
1732 PREFETCH" 64(%1, %%ebx) \n\t" | |
/* first pair of output samples: average/sum the 2x2 pixel groups */
1733 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1734 "movq (%0, %%ebx), %%mm0 \n\t" | |
1735 "movq (%1, %%ebx), %%mm1 \n\t" | |
1736 "movq 6(%0, %%ebx), %%mm2 \n\t" | |
1737 "movq 6(%1, %%ebx), %%mm3 \n\t" | |
1738 PAVGB(%%mm1, %%mm0) | |
1739 PAVGB(%%mm3, %%mm2) | |
1740 "movq %%mm0, %%mm1 \n\t" | |
1741 "movq %%mm2, %%mm3 \n\t" | |
1742 "psrlq $24, %%mm0 \n\t" | |
1743 "psrlq $24, %%mm2 \n\t" | |
1744 PAVGB(%%mm1, %%mm0) | |
1745 PAVGB(%%mm3, %%mm2) | |
1746 "punpcklbw %%mm7, %%mm0 \n\t" | |
1747 "punpcklbw %%mm7, %%mm2 \n\t" | |
1748 #else | |
1749 "movd (%0, %%ebx), %%mm0 \n\t" | |
1750 "movd (%1, %%ebx), %%mm1 \n\t" | |
1751 "movd 3(%0, %%ebx), %%mm2 \n\t" | |
1752 "movd 3(%1, %%ebx), %%mm3 \n\t" | |
1753 "punpcklbw %%mm7, %%mm0 \n\t" | |
1754 "punpcklbw %%mm7, %%mm1 \n\t" | |
1755 "punpcklbw %%mm7, %%mm2 \n\t" | |
1756 "punpcklbw %%mm7, %%mm3 \n\t" | |
1757 "paddw %%mm1, %%mm0 \n\t" | |
1758 "paddw %%mm3, %%mm2 \n\t" | |
1759 "paddw %%mm2, %%mm0 \n\t" | |
1760 "movd 6(%0, %%ebx), %%mm4 \n\t" | |
1761 "movd 6(%1, %%ebx), %%mm1 \n\t" | |
1762 "movd 9(%0, %%ebx), %%mm2 \n\t" | |
1763 "movd 9(%1, %%ebx), %%mm3 \n\t" | |
1764 "punpcklbw %%mm7, %%mm4 \n\t" | |
1765 "punpcklbw %%mm7, %%mm1 \n\t" | |
1766 "punpcklbw %%mm7, %%mm2 \n\t" | |
1767 "punpcklbw %%mm7, %%mm3 \n\t" | |
1768 "paddw %%mm1, %%mm4 \n\t" | |
1769 "paddw %%mm3, %%mm2 \n\t" | |
1770 "paddw %%mm4, %%mm2 \n\t" | |
1771 "psrlw $2, %%mm0 \n\t" | |
1772 "psrlw $2, %%mm2 \n\t" | |
1773 #endif | |
4923 | 1774 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
1775 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4619 | 1776 |
1777 "pmaddwd %%mm0, %%mm1 \n\t" | |
1778 "pmaddwd %%mm2, %%mm3 \n\t" | |
1779 "pmaddwd %%mm6, %%mm0 \n\t" | |
1780 "pmaddwd %%mm6, %%mm2 \n\t" | |
1781 #ifndef FAST_BGR2YV12 | |
1782 "psrad $8, %%mm0 \n\t" | |
1783 "psrad $8, %%mm1 \n\t" | |
1784 "psrad $8, %%mm2 \n\t" | |
1785 "psrad $8, %%mm3 \n\t" | |
1786 #endif | |
1787 "packssdw %%mm2, %%mm0 \n\t" | |
1788 "packssdw %%mm3, %%mm1 \n\t" | |
1789 "pmaddwd %%mm5, %%mm0 \n\t" | |
1790 "pmaddwd %%mm5, %%mm1 \n\t" | |
1791 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
1792 "psraw $7, %%mm0 \n\t" | |
1793 | |
/* second pair of output samples: same steps on source bytes 12..23 */
1794 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1795 "movq 12(%0, %%ebx), %%mm4 \n\t" | |
1796 "movq 12(%1, %%ebx), %%mm1 \n\t" | |
1797 "movq 18(%0, %%ebx), %%mm2 \n\t" | |
1798 "movq 18(%1, %%ebx), %%mm3 \n\t" | |
1799 PAVGB(%%mm1, %%mm4) | |
1800 PAVGB(%%mm3, %%mm2) | |
1801 "movq %%mm4, %%mm1 \n\t" | |
1802 "movq %%mm2, %%mm3 \n\t" | |
1803 "psrlq $24, %%mm4 \n\t" | |
1804 "psrlq $24, %%mm2 \n\t" | |
1805 PAVGB(%%mm1, %%mm4) | |
1806 PAVGB(%%mm3, %%mm2) | |
1807 "punpcklbw %%mm7, %%mm4 \n\t" | |
1808 "punpcklbw %%mm7, %%mm2 \n\t" | |
1809 #else | |
1810 "movd 12(%0, %%ebx), %%mm4 \n\t" | |
1811 "movd 12(%1, %%ebx), %%mm1 \n\t" | |
1812 "movd 15(%0, %%ebx), %%mm2 \n\t" | |
1813 "movd 15(%1, %%ebx), %%mm3 \n\t" | |
1814 "punpcklbw %%mm7, %%mm4 \n\t" | |
1815 "punpcklbw %%mm7, %%mm1 \n\t" | |
1816 "punpcklbw %%mm7, %%mm2 \n\t" | |
1817 "punpcklbw %%mm7, %%mm3 \n\t" | |
1818 "paddw %%mm1, %%mm4 \n\t" | |
1819 "paddw %%mm3, %%mm2 \n\t" | |
1820 "paddw %%mm2, %%mm4 \n\t" | |
1821 "movd 18(%0, %%ebx), %%mm5 \n\t" | |
1822 "movd 18(%1, %%ebx), %%mm1 \n\t" | |
1823 "movd 21(%0, %%ebx), %%mm2 \n\t" | |
1824 "movd 21(%1, %%ebx), %%mm3 \n\t" | |
1825 "punpcklbw %%mm7, %%mm5 \n\t" | |
1826 "punpcklbw %%mm7, %%mm1 \n\t" | |
1827 "punpcklbw %%mm7, %%mm2 \n\t" | |
1828 "punpcklbw %%mm7, %%mm3 \n\t" | |
1829 "paddw %%mm1, %%mm5 \n\t" | |
1830 "paddw %%mm3, %%mm2 \n\t" | |
1831 "paddw %%mm5, %%mm2 \n\t" | |
/* mm5 was used as a temporary above, so the w1111 constant is reloaded */
4923 | 1832 "movq "MANGLE(w1111)", %%mm5 \n\t" |
4619 | 1833 "psrlw $2, %%mm4 \n\t" |
1834 "psrlw $2, %%mm2 \n\t" | |
1835 #endif | |
4923 | 1836 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
1837 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4619 | 1838 |
1839 "pmaddwd %%mm4, %%mm1 \n\t" | |
1840 "pmaddwd %%mm2, %%mm3 \n\t" | |
1841 "pmaddwd %%mm6, %%mm4 \n\t" | |
1842 "pmaddwd %%mm6, %%mm2 \n\t" | |
1843 #ifndef FAST_BGR2YV12 | |
1844 "psrad $8, %%mm4 \n\t" | |
1845 "psrad $8, %%mm1 \n\t" | |
1846 "psrad $8, %%mm2 \n\t" | |
1847 "psrad $8, %%mm3 \n\t" | |
1848 #endif | |
1849 "packssdw %%mm2, %%mm4 \n\t" | |
1850 "packssdw %%mm3, %%mm1 \n\t" | |
1851 "pmaddwd %%mm5, %%mm4 \n\t" | |
1852 "pmaddwd %%mm5, %%mm1 \n\t" | |
1853 "addl $24, %%ebx \n\t" | |
1854 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | |
1855 "psraw $7, %%mm4 \n\t" | |
1856 | |
/* regroup to U3..U0 / V3..V0, pack to bytes, add the offset and store 4+4 bytes */
1857 "movq %%mm0, %%mm1 \n\t" | |
1858 "punpckldq %%mm4, %%mm0 \n\t" | |
1859 "punpckhdq %%mm4, %%mm1 \n\t" | |
1860 "packsswb %%mm1, %%mm0 \n\t" | |
4923 | 1861 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" |
4619 | 1862 |
1863 "movd %%mm0, (%2, %%eax) \n\t" | |
1864 "punpckhdq %%mm0, %%mm0 \n\t" | |
1865 "movd %%mm0, (%3, %%eax) \n\t" | |
1866 "addl $4, %%eax \n\t" | |
1867 " js 1b \n\t" | |
1868 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width) | |
1869 : "%eax", "%ebx" | |
1870 ); | |
/* portable C version: one output sample per iteration */
4467 | 1871 #else |
1872 int i; | |
1873 for(i=0; i<width; i++) | |
1874 { | |
1875 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3]; | |
1876 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4]; | |
1877 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5]; | |
1878 | |
1879 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1880 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1881 } | |
1882 #endif | |
1883 } | |
1884 | |
4578 | 1885 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width) |
1886 { | |
1887 int i; | |
1888 for(i=0; i<width; i++) | |
1889 { | |
9433 | 1890 int d= ((uint16_t*)src)[i]; |
4578 | 1891 int b= d&0x1F; |
1892 int g= (d>>5)&0x3F; | |
1893 int r= (d>>11)&0x1F; | |
1894 | |
1895 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16; | |
1896 } | |
1897 } | |
1898 | |
1899 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1900 { | |
1901 int i; | |
1902 for(i=0; i<width; i++) | |
1903 { | |
9433 | 1904 int d0= ((uint32_t*)src1)[i]; |
1905 int d1= ((uint32_t*)src2)[i]; | |
4579 | 1906 |
1907 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F); | |
1908 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F); | |
1909 | |
1910 int dh2= (dh>>11) + (dh<<21); | |
1911 int d= dh2 + dl; | |
1912 | |
1913 int b= d&0x7F; | |
1914 int r= (d>>11)&0x7F; | |
1915 int g= d>>21; | |
4578 | 1916 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128; |
1917 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128; | |
1918 } | |
1919 } | |
1920 | |
4580 | 1921 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width) |
1922 { | |
1923 int i; | |
1924 for(i=0; i<width; i++) | |
1925 { | |
9433 | 1926 int d= ((uint16_t*)src)[i]; |
4580 | 1927 int b= d&0x1F; |
1928 int g= (d>>5)&0x1F; | |
1929 int r= (d>>10)&0x1F; | |
1930 | |
1931 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16; | |
1932 } | |
1933 } | |
1934 | |
1935 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1936 { | |
1937 int i; | |
1938 for(i=0; i<width; i++) | |
1939 { | |
9433 | 1940 int d0= ((uint32_t*)src1)[i]; |
1941 int d1= ((uint32_t*)src2)[i]; | |
4580 | 1942 |
1943 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F); | |
1944 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F); | |
1945 | |
1946 int dh2= (dh>>11) + (dh<<21); | |
1947 int d= dh2 + dl; | |
1948 | |
1949 int b= d&0x7F; | |
1950 int r= (d>>10)&0x7F; | |
1951 int g= d>>21; | |
1952 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128; | |
1953 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128; | |
1954 } | |
1955 } | |
1956 | |
1957 | |
4558 | 1958 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width) |
1959 { | |
1960 int i; | |
1961 for(i=0; i<width; i++) | |
1962 { | |
9433 | 1963 int r= ((uint32_t*)src)[i]&0xFF; |
1964 int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
9499 | 1965 int b= (((uint32_t*)src)[i]>>16)&0xFF; |
4558 | 1966 |
9433 | 1967 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); |
4558 | 1968 } |
1969 } | |
1970 | |
1971 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1972 { | |
1973 int i; | |
1974 for(i=0; i<width; i++) | |
1975 { | |
9433 | 1976 const int a= ((uint32_t*)src1)[2*i+0]; |
1977 const int e= ((uint32_t*)src1)[2*i+1]; | |
1978 const int c= ((uint32_t*)src2)[2*i+0]; | |
1979 const int d= ((uint32_t*)src2)[2*i+1]; | |
1980 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF); | |
1981 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00); | |
1982 const int r= l&0x3FF; | |
1983 const int g= h>>8; | |
1984 const int b= l>>16; | |
4558 | 1985 |
1986 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1987 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1988 } | |
1989 } | |
1990 | |
1991 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width) | |
1992 { | |
1993 int i; | |
1994 for(i=0; i<width; i++) | |
1995 { | |
1996 int r= src[i*3+0]; | |
1997 int g= src[i*3+1]; | |
1998 int b= src[i*3+2]; | |
1999 | |
9433 | 2000 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); |
4558 | 2001 } |
2002 } | |
2003 | |
2004 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2005 { | |
2006 int i; | |
2007 for(i=0; i<width; i++) | |
2008 { | |
2009 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3]; | |
2010 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4]; | |
2011 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5]; | |
2012 | |
2013 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2014 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2015 } | |
2016 } | |
2017 | |
4467 | 2018 |
3272 | 2019 // Bilinear / Bicubic scaling |
/*
 * hScale: horizontal FIR scaling of one line of 8 bit samples to 16 bit
 * intermediates.
 *
 * As the C version at the end of this function shows, for every output i:
 *   val    = sum over j<filterSize of src[filterPos[i] + j] * filter[filterSize*i + j]
 *   dst[i] = val>>7, clipped to 0 .. (1<<15)-1
 *
 * dst        : receives dstW 16 bit results
 * src        : source samples
 * srcW, xInc : only forwarded to hScale_altivec_real(); not used by the MMX
 *              and C versions
 * filter     : filterSize coefficients per output sample, stored consecutively
 * filterPos  : first source index used for each output sample
 * filterSize : number of taps; the MMX version asserts it is a positive
 *              multiple of 4
 *
 * MMX version: three loops (filterSize 4, 8, and the general case). In all of
 * them 'counter' starts at -2*dstW and is increased by 4 per iteration until
 * the addition carries (jnc), and one 32 bit store writes two 16 bit results
 * per iteration; filterPos and dst (and filter in the first two loops) are
 * biased beforehand so that they can be indexed with the negative counter.
 * NOTE(review): because two results are produced per iteration, an odd dstW
 * makes the last iteration read one extra filterPos entry and write one extra
 * dst entry -- presumably the buffers are padded; confirm.
 * NOTE(review): w02 is defined outside this chunk; the psrad $8 / pmaddwd w02
 * sequence presumably yields the same >>7 scaling as the C version -- confirm.
 */
2020 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, | |
2021 int16_t *filter, int16_t *filterPos, int filterSize) | |
2022 { | |
2023 #ifdef HAVE_MMX | |
9921
61057de81510
mplayer idependant (not really yet) swscale example
michael
parents:
9499
diff
changeset
|
2024 assert(filterSize % 4 == 0 && filterSize>0); |
3272 | 2025 if(filterSize==4) // always true for upscaling, sometimes for down too |
2026 { | |
2027 int counter= -2*dstW; | |
2028 filter-= counter*2; | |
2029 filterPos-= counter/2; | |
2030 dst-= counter/2; | |
/* counter is passed in eax and moved to ebp, which is saved and restored by hand */
2031 asm volatile( | |
2032 "pxor %%mm7, %%mm7 \n\t" | |
4248 | 2033 "movq "MANGLE(w02)", %%mm6 \n\t" |
3272 | 2034 "pushl %%ebp \n\t" // we use 7 regs here ... |
2035 "movl %%eax, %%ebp \n\t" | |
2036 ".balign 16 \n\t" | |
2037 "1: \n\t" | |
2038 "movzwl (%2, %%ebp), %%eax \n\t" | |
2039 "movzwl 2(%2, %%ebp), %%ebx \n\t" | |
2040 "movq (%1, %%ebp, 4), %%mm1 \n\t" | |
2041 "movq 8(%1, %%ebp, 4), %%mm3 \n\t" | |
2042 "movd (%3, %%eax), %%mm0 \n\t" | |
2043 "movd (%3, %%ebx), %%mm2 \n\t" | |
2044 "punpcklbw %%mm7, %%mm0 \n\t" | |
2045 "punpcklbw %%mm7, %%mm2 \n\t" | |
2046 "pmaddwd %%mm1, %%mm0 \n\t" | |
2047 "pmaddwd %%mm2, %%mm3 \n\t" | |
2048 "psrad $8, %%mm0 \n\t" | |
2049 "psrad $8, %%mm3 \n\t" | |
2050 "packssdw %%mm3, %%mm0 \n\t" | |
2051 "pmaddwd %%mm6, %%mm0 \n\t" | |
2052 "packssdw %%mm0, %%mm0 \n\t" | |
2053 "movd %%mm0, (%4, %%ebp) \n\t" | |
2054 "addl $4, %%ebp \n\t" | |
2055 " jnc 1b \n\t" | |
3352 | 2056 |
3272 | 2057 "popl %%ebp \n\t" |
2058 : "+a" (counter) | |
2059 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
2060 : "%ebx" | |
2061 ); | |
2062 } | |
2063 else if(filterSize==8) | |
2064 { | |
2065 int counter= -2*dstW; | |
2066 filter-= counter*4; | |
2067 filterPos-= counter/2; | |
2068 dst-= counter/2; | |
/* same scheme as the 4 tap loop, with the taps handled in two groups of 4 that are added up */
2069 asm volatile( | |
2070 "pxor %%mm7, %%mm7 \n\t" | |
4248 | 2071 "movq "MANGLE(w02)", %%mm6 \n\t" |
3272 | 2072 "pushl %%ebp \n\t" // we use 7 regs here ... |
2073 "movl %%eax, %%ebp \n\t" | |
2074 ".balign 16 \n\t" | |
2075 "1: \n\t" | |
2076 "movzwl (%2, %%ebp), %%eax \n\t" | |
2077 "movzwl 2(%2, %%ebp), %%ebx \n\t" | |
2078 "movq (%1, %%ebp, 8), %%mm1 \n\t" | |
2079 "movq 16(%1, %%ebp, 8), %%mm3 \n\t" | |
2080 "movd (%3, %%eax), %%mm0 \n\t" | |
2081 "movd (%3, %%ebx), %%mm2 \n\t" | |
2082 "punpcklbw %%mm7, %%mm0 \n\t" | |
2083 "punpcklbw %%mm7, %%mm2 \n\t" | |
2084 "pmaddwd %%mm1, %%mm0 \n\t" | |
2085 "pmaddwd %%mm2, %%mm3 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
2086 |
3272 | 2087 "movq 8(%1, %%ebp, 8), %%mm1 \n\t" |
2088 "movq 24(%1, %%ebp, 8), %%mm5 \n\t" | |
2089 "movd 4(%3, %%eax), %%mm4 \n\t" | |
2090 "movd 4(%3, %%ebx), %%mm2 \n\t" | |
2091 "punpcklbw %%mm7, %%mm4 \n\t" | |
2092 "punpcklbw %%mm7, %%mm2 \n\t" | |
2093 "pmaddwd %%mm1, %%mm4 \n\t" | |
2094 "pmaddwd %%mm2, %%mm5 \n\t" | |
2095 "paddd %%mm4, %%mm0 \n\t" | |
2096 "paddd %%mm5, %%mm3 \n\t" | |
2097 | |
2098 "psrad $8, %%mm0 \n\t" | |
2099 "psrad $8, %%mm3 \n\t" | |
2100 "packssdw %%mm3, %%mm0 \n\t" | |
2101 "pmaddwd %%mm6, %%mm0 \n\t" | |
2102 "packssdw %%mm0, %%mm0 \n\t" | |
2103 "movd %%mm0, (%4, %%ebp) \n\t" | |
2104 "addl $4, %%ebp \n\t" | |
2105 " jnc 1b \n\t" | |
3344 | 2106 |
3272 | 2107 "popl %%ebp \n\t" |
2108 : "+a" (counter) | |
2109 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
2110 : "%ebx" | |
2111 ); | |
2112 } | |
2113 else | |
2114 { | |
2115 int counter= -2*dstW; | |
2116 // filter-= counter*filterSize/2; | |
2117 filterPos-= counter/2; | |
2118 dst-= counter/2; | |
/*
 * General case: the inner loop (label 2) walks 4 taps at a time for two output
 * samples at once, from src up to src+filterSize (operand %4); the filter
 * pointer itself is advanced (by 8 per inner step, plus filterSize*2 after the
 * inner loop) instead of being indexed with the counter.
 */
2119 asm volatile( | |
2120 "pxor %%mm7, %%mm7 \n\t" | |
4248 | 2121 "movq "MANGLE(w02)", %%mm6 \n\t" |
3272 | 2122 ".balign 16 \n\t" |
2123 "1: \n\t" | |
2124 "movl %2, %%ecx \n\t" | |
2125 "movzwl (%%ecx, %0), %%eax \n\t" | |
2126 "movzwl 2(%%ecx, %0), %%ebx \n\t" | |
2127 "movl %5, %%ecx \n\t" | |
2128 "pxor %%mm4, %%mm4 \n\t" | |
2129 "pxor %%mm5, %%mm5 \n\t" | |
2130 "2: \n\t" | |
2131 "movq (%1), %%mm1 \n\t" | |
2132 "movq (%1, %6), %%mm3 \n\t" | |
2133 "movd (%%ecx, %%eax), %%mm0 \n\t" | |
2134 "movd (%%ecx, %%ebx), %%mm2 \n\t" | |
2135 "punpcklbw %%mm7, %%mm0 \n\t" | |
2136 "punpcklbw %%mm7, %%mm2 \n\t" | |
2137 "pmaddwd %%mm1, %%mm0 \n\t" | |
2138 "pmaddwd %%mm2, %%mm3 \n\t" | |
2139 "paddd %%mm3, %%mm5 \n\t" | |
2140 "paddd %%mm0, %%mm4 \n\t" | |
2141 "addl $8, %1 \n\t" | |
2142 "addl $4, %%ecx \n\t" | |
2143 "cmpl %4, %%ecx \n\t" | |
2144 " jb 2b \n\t" | |
2145 "addl %6, %1 \n\t" | |
2146 "psrad $8, %%mm4 \n\t" | |
2147 "psrad $8, %%mm5 \n\t" | |
2148 "packssdw %%mm5, %%mm4 \n\t" | |
2149 "pmaddwd %%mm6, %%mm4 \n\t" | |
2150 "packssdw %%mm4, %%mm4 \n\t" | |
2151 "movl %3, %%eax \n\t" | |
2152 "movd %%mm4, (%%eax, %0) \n\t" | |
2153 "addl $4, %0 \n\t" | |
2154 " jnc 1b \n\t" | |
3344 | 2155 |
3641 | 2156 : "+r" (counter), "+r" (filter) |
2157 : "m" (filterPos), "m" (dst), "m"(src+filterSize), | |
3272 | 2158 "m" (src), "r" (filterSize*2) |
3299 | 2159 : "%ebx", "%eax", "%ecx" |
3272 | 2160 ); |
2161 } | |
2162 #else | |
12130
2ef24558b732
AltiVec hScale, all size patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michael
parents:
12017
diff
changeset
|
2163 #ifdef HAVE_ALTIVEC |
2ef24558b732
AltiVec hScale, all size patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michael
parents:
12017
diff
changeset
|
2164 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize); |
2ef24558b732
AltiVec hScale, all size patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michael
parents:
12017
diff
changeset
|
2165 #else |
/* portable C version: plain FIR sum per output sample, then >>7 and clip */
3272 | 2166 int i; |
2167 for(i=0; i<dstW; i++) | |
2168 { | |
2169 int j; | |
2170 int srcPos= filterPos[i]; | |
2171 int val=0; | |
3344 | 2172 // printf("filterPos: %d\n", filterPos[i]); |
3272 | 2173 for(j=0; j<filterSize; j++) |
2174 { | |
2175 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); | |
2176 val += ((int)src[srcPos + j])*filter[filterSize*i + j]; | |
2177 } | |
2178 // filter += hFilterSize; | |
2179 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ... | |
2180 // dst[i] = val>>7; | |
2181 } | |
2182 #endif | |
12130
2ef24558b732
AltiVec hScale, all size patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michael
parents:
12017
diff
changeset
|
2183 #endif |
3272 | 2184 } |
2185 // *** horizontal scale Y line to temp buffer | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2186 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc, |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2187 int flags, int canMMX2BeUsed, int16_t *hLumFilter, |
4467 | 2188 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, |
5452 | 2189 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, |
2190 int32_t *mmx2FilterPos) | |
2469 | 2191 { |
4467 | 2192 if(srcFormat==IMGFMT_YUY2) |
2193 { | |
2194 RENAME(yuy2ToY)(formatConvBuffer, src, srcW); | |
2195 src= formatConvBuffer; | |
2196 } | |
9071 | 2197 else if(srcFormat==IMGFMT_UYVY) |
2198 { | |
2199 RENAME(uyvyToY)(formatConvBuffer, src, srcW); | |
2200 src= formatConvBuffer; | |
2201 } | |
4467 | 2202 else if(srcFormat==IMGFMT_BGR32) |
2203 { | |
2204 RENAME(bgr32ToY)(formatConvBuffer, src, srcW); | |
2205 src= formatConvBuffer; | |
2206 } | |
2207 else if(srcFormat==IMGFMT_BGR24) | |
2208 { | |
2209 RENAME(bgr24ToY)(formatConvBuffer, src, srcW); | |
2210 src= formatConvBuffer; | |
2211 } | |
4578 | 2212 else if(srcFormat==IMGFMT_BGR16) |
2213 { | |
2214 RENAME(bgr16ToY)(formatConvBuffer, src, srcW); | |
2215 src= formatConvBuffer; | |
2216 } | |
4580 | 2217 else if(srcFormat==IMGFMT_BGR15) |
2218 { | |
2219 RENAME(bgr15ToY)(formatConvBuffer, src, srcW); | |
2220 src= formatConvBuffer; | |
2221 } | |
4558 | 2222 else if(srcFormat==IMGFMT_RGB32) |
2223 { | |
2224 RENAME(rgb32ToY)(formatConvBuffer, src, srcW); | |
2225 src= formatConvBuffer; | |
2226 } | |
2227 else if(srcFormat==IMGFMT_RGB24) | |
2228 { | |
2229 RENAME(rgb24ToY)(formatConvBuffer, src, srcW); | |
2230 src= formatConvBuffer; | |
2231 } | |
4467 | 2232 |
3352 | 2233 #ifdef HAVE_MMX |
11000 | 2234 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one) |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2235 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) |
3352 | 2236 #else |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2237 if(!(flags&SWS_FAST_BILINEAR)) |
3352 | 2238 #endif |
3272 | 2239 { |
2240 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); | |
2241 } | |
2242 else // Fast Bilinear upscale / crap downscale | |
2243 { | |
2469 | 2244 #ifdef ARCH_X86 |
2245 #ifdef HAVE_MMX2 | |
2671 | 2246 int i; |
2469 | 2247 if(canMMX2BeUsed) |
2248 { | |
2249 asm volatile( | |
2250 "pxor %%mm7, %%mm7 \n\t" | |
5452 | 2251 "movl %0, %%ecx \n\t" |
2252 "movl %1, %%edi \n\t" | |
2253 "movl %2, %%edx \n\t" | |
2254 "movl %3, %%ebx \n\t" | |
2469 | 2255 "xorl %%eax, %%eax \n\t" // i |
5452 | 2256 PREFETCH" (%%ecx) \n\t" |
2257 PREFETCH" 32(%%ecx) \n\t" | |
2258 PREFETCH" 64(%%ecx) \n\t" | |
2520 | 2259 |
2469 | 2260 #define FUNNY_Y_CODE \ |
5452 | 2261 "movl (%%ebx), %%esi \n\t"\ |
2262 "call *%4 \n\t"\ | |
2263 "addl (%%ebx, %%eax), %%ecx \n\t"\ | |
2264 "addl %%eax, %%edi \n\t"\ | |
2265 "xorl %%eax, %%eax \n\t"\ | |
2520 | 2266 |
2469 | 2267 FUNNY_Y_CODE |
2268 FUNNY_Y_CODE | |
2269 FUNNY_Y_CODE | |
2270 FUNNY_Y_CODE | |
2271 FUNNY_Y_CODE | |
2272 FUNNY_Y_CODE | |
2273 FUNNY_Y_CODE | |
2274 FUNNY_Y_CODE | |
2275 | |
5452 | 2276 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2277 "m" (funnyYCode) | |
2469 | 2278 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" |
2279 ); | |
3215 | 2280 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; |
2469 | 2281 } |
2282 else | |
2283 { | |
2284 #endif | |
2285 //NO MMX just normal asm ... | |
2286 asm volatile( | |
2287 "xorl %%eax, %%eax \n\t" // i | |
2288 "xorl %%ebx, %%ebx \n\t" // xx | |
2289 "xorl %%ecx, %%ecx \n\t" // 2*xalpha | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
2290 ".balign 16 \n\t" |
2469 | 2291 "1: \n\t" |
2292 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | |
2293 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | |
2294 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2295 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2296 "shll $16, %%edi \n\t" | |
2297 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2298 "movl %1, %%edi \n\t" | |
2299 "shrl $9, %%esi \n\t" | |
2300 "movw %%si, (%%edi, %%eax, 2) \n\t" | |
2301 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
2302 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry | |
2303 | |
2304 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | |
2305 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | |
2306 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2307 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2308 "shll $16, %%edi \n\t" | |
2309 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2310 "movl %1, %%edi \n\t" | |
2311 "shrl $9, %%esi \n\t" | |
2312 "movw %%si, 2(%%edi, %%eax, 2) \n\t" | |
2313 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
2314 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry | |
2315 | |
2316 | |
2317 "addl $2, %%eax \n\t" | |
2318 "cmpl %2, %%eax \n\t" | |
2319 " jb 1b \n\t" | |
2320 | |
2321 | |
2322 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF) | |
2323 : "%eax", "%ebx", "%ecx", "%edi", "%esi" | |
2324 ); | |
2325 #ifdef HAVE_MMX2 | |
11000 | 2326 } //if MMX2 can't be used |
2469 | 2327 #endif |
2328 #else | |
2671 | 2329 int i; |
2330 unsigned int xpos=0; | |
2331 for(i=0;i<dstWidth;i++) | |
2332 { | |
2333 register unsigned int xx=xpos>>16; | |
2334 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2335 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; | |
2336 xpos+=xInc; | |
2337 } | |
2469 | 2338 #endif |
3272 | 2339 } |
2469 | 2340 } |
2341 | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2342 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2, |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2343 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, |
4467 | 2344 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, |
5452 | 2345 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, |
2346 int32_t *mmx2FilterPos) | |
2469 | 2347 { |
4467 | 2348 if(srcFormat==IMGFMT_YUY2) |
2349 { | |
2350 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2351 src1= formatConvBuffer; | |
2352 src2= formatConvBuffer+2048; | |
2353 } | |
9071 | 2354 else if(srcFormat==IMGFMT_UYVY) |
2355 { | |
2356 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2357 src1= formatConvBuffer; | |
2358 src2= formatConvBuffer+2048; | |
2359 } | |
4467 | 2360 else if(srcFormat==IMGFMT_BGR32) |
2361 { | |
2362 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2363 src1= formatConvBuffer; | |
2364 src2= formatConvBuffer+2048; | |
2365 } | |
2366 else if(srcFormat==IMGFMT_BGR24) | |
2367 { | |
2368 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2369 src1= formatConvBuffer; | |
2370 src2= formatConvBuffer+2048; | |
2371 } | |
4578 | 2372 else if(srcFormat==IMGFMT_BGR16) |
2373 { | |
2374 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2375 src1= formatConvBuffer; | |
2376 src2= formatConvBuffer+2048; | |
2377 } | |
4580 | 2378 else if(srcFormat==IMGFMT_BGR15) |
2379 { | |
2380 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2381 src1= formatConvBuffer; | |
2382 src2= formatConvBuffer+2048; | |
2383 } | |
4558 | 2384 else if(srcFormat==IMGFMT_RGB32) |
2385 { | |
2386 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2387 src1= formatConvBuffer; | |
2388 src2= formatConvBuffer+2048; | |
2389 } | |
2390 else if(srcFormat==IMGFMT_RGB24) | |
2391 { | |
2392 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2393 src1= formatConvBuffer; | |
2394 src2= formatConvBuffer+2048; | |
2395 } | |
4481 | 2396 else if(isGray(srcFormat)) |
2397 { | |
2398 return; | |
2399 } | |
4467 | 2400 |
3352 | 2401 #ifdef HAVE_MMX |
11000 | 2402 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one) |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2403 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) |
3352 | 2404 #else |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2405 if(!(flags&SWS_FAST_BILINEAR)) |
3352 | 2406 #endif |
3272 | 2407 { |
2408 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2409 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2410 } | |
2411 else // Fast Bilinear upscale / crap downscale | |
2412 { | |
2469 | 2413 #ifdef ARCH_X86 |
2414 #ifdef HAVE_MMX2 | |
2671 | 2415 int i; |
2469 | 2416 if(canMMX2BeUsed) |
2417 { | |
2418 asm volatile( | |
5452 | 2419 "pxor %%mm7, %%mm7 \n\t" |
2420 "movl %0, %%ecx \n\t" | |
2421 "movl %1, %%edi \n\t" | |
2422 "movl %2, %%edx \n\t" | |
2423 "movl %3, %%ebx \n\t" | |
2424 "xorl %%eax, %%eax \n\t" // i | |
2425 PREFETCH" (%%ecx) \n\t" | |
2426 PREFETCH" 32(%%ecx) \n\t" | |
2427 PREFETCH" 64(%%ecx) \n\t" | |
2428 | |
2429 #define FUNNY_UV_CODE \ | |
2430 "movl (%%ebx), %%esi \n\t"\ | |
2431 "call *%4 \n\t"\ | |
2432 "addl (%%ebx, %%eax), %%ecx \n\t"\ | |
2433 "addl %%eax, %%edi \n\t"\ | |
2434 "xorl %%eax, %%eax \n\t"\ | |
2469 | 2435 |
5452 | 2436 FUNNY_UV_CODE |
2437 FUNNY_UV_CODE | |
2438 FUNNY_UV_CODE | |
2439 FUNNY_UV_CODE | |
2440 "xorl %%eax, %%eax \n\t" // i | |
2441 "movl %5, %%ecx \n\t" // src | |
2442 "movl %1, %%edi \n\t" // buf1 | |
2443 "addl $4096, %%edi \n\t" | |
2444 PREFETCH" (%%ecx) \n\t" | |
2445 PREFETCH" 32(%%ecx) \n\t" | |
2446 PREFETCH" 64(%%ecx) \n\t" | |
2469 | 2447 |
5452 | 2448 FUNNY_UV_CODE |
2449 FUNNY_UV_CODE | |
2450 FUNNY_UV_CODE | |
2451 FUNNY_UV_CODE | |
2469 | 2452 |
5452 | 2453 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2454 "m" (funnyUVCode), "m" (src2) | |
2455 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | |
2456 ); | |
3344 | 2457 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) |
2469 | 2458 { |
3344 | 2459 // printf("%d %d %d\n", dstWidth, i, srcW); |
2460 dst[i] = src1[srcW-1]*128; | |
2461 dst[i+2048] = src2[srcW-1]*128; | |
2469 | 2462 } |
2463 } | |
2464 else | |
2465 { | |
2466 #endif | |
2467 asm volatile( | |
2468 "xorl %%eax, %%eax \n\t" // i | |
2469 "xorl %%ebx, %%ebx \n\t" // xx | |
2470 "xorl %%ecx, %%ecx \n\t" // 2*xalpha | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
2471 ".balign 16 \n\t" |
2469 | 2472 "1: \n\t" |
2473 "movl %0, %%esi \n\t" | |
2474 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx] | |
2475 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1] | |
2476 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2477 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2478 "shll $16, %%edi \n\t" | |
2479 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2480 "movl %1, %%edi \n\t" | |
2481 "shrl $9, %%esi \n\t" | |
2482 "movw %%si, (%%edi, %%eax, 2) \n\t" | |
2483 | |
2484 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx] | |
2485 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1] | |
2486 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
2487 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2488 "shll $16, %%edi \n\t" | |
2489 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2490 "movl %1, %%edi \n\t" | |
2491 "shrl $9, %%esi \n\t" | |
2492 "movw %%si, 4096(%%edi, %%eax, 2)\n\t" | |
2493 | |
2494 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
2495 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry | |
2496 "addl $1, %%eax \n\t" | |
2497 "cmpl %2, %%eax \n\t" | |
2498 " jb 1b \n\t" | |
2499 | |
2500 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF), | |
2501 "r" (src2) | |
2502 : "%eax", "%ebx", "%ecx", "%edi", "%esi" | |
2503 ); | |
2504 #ifdef HAVE_MMX2 | |
11000 | 2505 } //if MMX2 can't be used |
2469 | 2506 #endif |
2507 #else | |
2671 | 2508 int i; |
2509 unsigned int xpos=0; | |
2510 for(i=0;i<dstWidth;i++) | |
2511 { | |
2512 register unsigned int xx=xpos>>16; | |
2513 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2514 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | |
2515 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | |
2566 | 2516 /* slower |
2517 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; | |
2518 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; | |
2519 */ | |
2671 | 2520 xpos+=xInc; |
2521 } | |
2469 | 2522 #endif |
3272 | 2523 } |
2524 } | |
2525 | |
9499 | 2526 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, |
2527 int srcSliceH, uint8_t* dst[], int dstStride[]){ | |
3344 | 2528 |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2529 /* load a few things into local vars to make the code more readable? and faster */ |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2530 const int srcW= c->srcW; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2531 const int dstW= c->dstW; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2532 const int dstH= c->dstH; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2533 const int chrDstW= c->chrDstW; |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2534 const int chrSrcW= c->chrSrcW; |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2535 const int lumXInc= c->lumXInc; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2536 const int chrXInc= c->chrXInc; |
4295 | 2537 const int dstFormat= c->dstFormat; |
6503 | 2538 const int srcFormat= c->srcFormat; |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2539 const int flags= c->flags; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2540 const int canMMX2BeUsed= c->canMMX2BeUsed; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2541 int16_t *vLumFilterPos= c->vLumFilterPos; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2542 int16_t *vChrFilterPos= c->vChrFilterPos; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2543 int16_t *hLumFilterPos= c->hLumFilterPos; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2544 int16_t *hChrFilterPos= c->hChrFilterPos; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2545 int16_t *vLumFilter= c->vLumFilter; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2546 int16_t *vChrFilter= c->vChrFilter; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2547 int16_t *hLumFilter= c->hLumFilter; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2548 int16_t *hChrFilter= c->hChrFilter; |
9413 | 2549 int32_t *lumMmxFilter= c->lumMmxFilter; |
2550 int32_t *chrMmxFilter= c->chrMmxFilter; | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2551 const int vLumFilterSize= c->vLumFilterSize; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2552 const int vChrFilterSize= c->vChrFilterSize; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2553 const int hLumFilterSize= c->hLumFilterSize; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2554 const int hChrFilterSize= c->hChrFilterSize; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2555 int16_t **lumPixBuf= c->lumPixBuf; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2556 int16_t **chrPixBuf= c->chrPixBuf; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2557 const int vLumBufSize= c->vLumBufSize; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2558 const int vChrBufSize= c->vChrBufSize; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2559 uint8_t *funnyYCode= c->funnyYCode; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2560 uint8_t *funnyUVCode= c->funnyUVCode; |
4467 | 2561 uint8_t *formatConvBuffer= c->formatConvBuffer; |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2562 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample; |
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2563 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); |
9494
543ab3909b78
sws_ prefix, more seperation between internal & external swscaler API
michael
parents:
9476
diff
changeset
|
2564 int lastDstY; |
3344 | 2565 |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2566 /* vars which will change and which we need to store back in the context */ |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2567 int dstY= c->dstY; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2568 int lumBufIndex= c->lumBufIndex; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2569 int chrBufIndex= c->chrBufIndex; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2570 int lastInLumBuf= c->lastInLumBuf; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2571 int lastInChrBuf= c->lastInChrBuf; |
6540 | 2572 |
2573 if(isPacked(c->srcFormat)){ | |
4467 | 2574 src[0]= |
2575 src[1]= | |
9499 | 2576 src[2]= src[0]; |
6540 | 2577 srcStride[0]= |
4467 | 2578 srcStride[1]= |
9499 | 2579 srcStride[2]= srcStride[0]; |
4467 | 2580 } |
6540 | 2581 srcStride[1]<<= c->vChrDrop; |
2582 srcStride[2]<<= c->vChrDrop; | |
4419 | 2583 |
6517 | 2584 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2], |
2585 // (int)dst[0], (int)dst[1], (int)dst[2]); | |
2586 | |
2587 #if 0 //self test FIXME move to a vfilter or something | |
2588 { | |
2589 static volatile int i=0; | |
2590 i++; | |
2591 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH) | |
2592 selfTest(src, srcStride, c->srcW, c->srcH); | |
2593 i--; | |
2594 } | |
2595 #endif | |
4554 | 2596 |
2597 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2], | |
2598 //dstStride[0],dstStride[1],dstStride[2]); | |
4419 | 2599 |
2600 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0) | |
2601 { | |
2602 static int firstTime=1; //FIXME move this into the context perhaps | |
2603 if(flags & SWS_PRINT_INFO && firstTime) | |
2604 { | |
9970 | 2605 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n" |
4419 | 2606 "SwScaler: ->cannot do aligned memory acesses anymore\n"); |
2607 firstTime=0; | |
2608 } | |
2609 } | |
3344 | 2610 |
4467 | 2611 /* Note: the user might start scaling the picture in the middle, so this will not get executed;
2612 this is not really intended but works currently, so people might do it */ | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2613 if(srcSliceY ==0){ |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2614 lumBufIndex=0; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2615 chrBufIndex=0; |
4467 | 2616 dstY=0; |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2617 lastInLumBuf= -1; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2618 lastInChrBuf= -1; |
3272 | 2619 } |
3344 | 2620 |
9494
543ab3909b78
sws_ prefix, more seperation between internal & external swscaler API
michael
parents:
9476
diff
changeset
|
2621 lastDstY= dstY; |
543ab3909b78
sws_ prefix, more seperation between internal & external swscaler API
michael
parents:
9476
diff
changeset
|
2622 |
3344 | 2623 for(;dstY < dstH; dstY++){ |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2624 unsigned char *dest =dst[0]+dstStride[0]*dstY; |
6520 | 2625 const int chrDstY= dstY>>c->chrDstVSubSample; |
2626 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY; | |
2627 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY; | |
3344 | 2628 |
2629 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input | |
2630 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input | |
2631 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input | |
2632 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input | |
2633 | |
11122 | 2634 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n", |
2635 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample); | |
4290
1f8ceb12284d
general convolution filtering of the source picture
michael
parents:
4276
diff
changeset
|
2636 //handle holes (FAST_BILINEAR & weird filters) |
1f8ceb12284d
general convolution filtering of the source picture
michael
parents:
4276
diff
changeset
|
2637 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; |
1f8ceb12284d
general convolution filtering of the source picture
michael
parents:
4276
diff
changeset
|
2638 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; |
1f8ceb12284d
general convolution filtering of the source picture
michael
parents:
4276
diff
changeset
|
2639 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize); |
3344 | 2640 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1) |
2641 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1) | |
2216 | 2642 |
3344 | 2643 // Do we have enough lines in this slice to output the dstY line |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2644 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample)) |
2469 | 2645 { |
3344 | 2646 //Do horizontal scaling |
2647 while(lastInLumBuf < lastLumSrcY) | |
2648 { | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2649 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; |
3344 | 2650 lumBufIndex++; |
4290
1f8ceb12284d
general convolution filtering of the source picture
michael
parents:
4276
diff
changeset
|
2651 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY); |
3344 | 2652 ASSERT(lumBufIndex < 2*vLumBufSize) |
2653 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
2654 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
2655 // printf("%d %d\n", lumBufIndex, vLumBufSize); | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2656 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2657 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, |
5452 | 2658 funnyYCode, c->srcFormat, formatConvBuffer, |
2659 c->lumMmx2Filter, c->lumMmx2FilterPos); | |
3344 | 2660 lastInLumBuf++; |
2661 } | |
2662 while(lastInChrBuf < lastChrSrcY) | |
2663 { | |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2664 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; |
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2665 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; |
3344 | 2666 chrBufIndex++; |
2667 ASSERT(chrBufIndex < 2*vChrBufSize) | |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2668 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)) |
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2669 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0) |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2670 //FIXME replace parameters through context struct (some at least) |
6503 | 2671 |
2672 if(!(isGray(srcFormat) || isGray(dstFormat))) | |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2673 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2674 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, |
5452 | 2675 funnyUVCode, c->srcFormat, formatConvBuffer, |
2676 c->chrMmx2Filter, c->chrMmx2FilterPos); | |
3344 | 2677 lastInChrBuf++; |
2678 } | |
2679 //wrap buf index around to stay inside the ring buffer | |
2680 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
2681 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
2469 | 2682 } |
3344 | 2683 else // not enough lines left in this slice -> load the rest in the buffer |
2469 | 2684 { |
3344 | 2685 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n", |
2686 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY, | |
2687 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize, | |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2688 vChrBufSize, vLumBufSize);*/ |
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2689 |
3344 | 2690 //Do horizontal scaling |
2691 while(lastInLumBuf+1 < srcSliceY + srcSliceH) | |
2469 | 2692 { |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2693 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; |
3344 | 2694 lumBufIndex++; |
2695 ASSERT(lumBufIndex < 2*vLumBufSize) | |
2696 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
2697 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2698 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2699 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, |
5452 | 2700 funnyYCode, c->srcFormat, formatConvBuffer, |
2701 c->lumMmx2Filter, c->lumMmx2FilterPos); | |
3344 | 2702 lastInLumBuf++; |
2469 | 2703 } |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2704 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH)) |
3344 | 2705 { |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2706 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; |
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2707 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; |
3344 | 2708 chrBufIndex++; |
2709 ASSERT(chrBufIndex < 2*vChrBufSize) | |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2710 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH) |
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2711 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0) |
6503 | 2712 |
2713 if(!(isGray(srcFormat) || isGray(dstFormat))) | |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2714 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2715 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, |
5452 | 2716 funnyUVCode, c->srcFormat, formatConvBuffer, |
2717 c->chrMmx2Filter, c->chrMmx2FilterPos); | |
3344 | 2718 lastInChrBuf++; |
2719 } | |
2720 //wrap buf index around to stay inside the ring buffer | |
2721 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
2722 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
11000 | 2723 break; //we can't output a dstY line so let's try with the next slice |
2469 | 2724 } |
2264
7851375ea156
increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents:
2237
diff
changeset
|
2725 |
2748 | 2726 #ifdef HAVE_MMX |
3344 | 2727 b5Dither= dither8[dstY&1]; |
2728 g6Dither= dither4[dstY&1]; | |
2729 g5Dither= dither8[dstY&1]; | |
2730 r5Dither= dither8[(dstY+1)&1]; | |
2748 | 2731 #endif |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2732 if(dstY < dstH-2) |
3352 | 2733 { |
9414 | 2734 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; |
2735 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
2736 #ifdef HAVE_MMX | |
2737 int i; | |
2738 for(i=0; i<vLumFilterSize; i++) | |
2739 { | |
2740 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i]; | |
2741 lumMmxFilter[4*i+2]= | |
2742 lumMmxFilter[4*i+3]= | |
2743 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; | |
2744 } | |
2745 for(i=0; i<vChrFilterSize; i++) | |
2746 { | |
2747 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i]; | |
2748 chrMmxFilter[4*i+2]= | |
2749 chrMmxFilter[4*i+3]= | |
2750 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; | |
2751 } | |
2752 #endif | |
6503 | 2753 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like |
3344 | 2754 { |
7351 | 2755 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; |
2756 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
3344 | 2757 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12 |
2758 { | |
2759 int16_t *lumBuf = lumPixBuf[0]; | |
2760 int16_t *chrBuf= chrPixBuf[0]; | |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2761 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW); |
3344 | 2762 } |
2763 else //General YV12 | |
2764 { | |
9413 | 2765 RENAME(yuv2yuvX)(c, |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2766 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, |
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2767 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, |
9414 | 2768 dest, uDest, vDest, dstW, chrDstW); |
3344 | 2769 } |
2770 } | |
2771 else | |
2772 { | |
2773 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
2774 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
2775 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB | |
2776 { | |
2777 int chrAlpha= vChrFilter[2*dstY+1]; | |
7723 | 2778 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), |
6578 | 2779 dest, dstW, chrAlpha, dstFormat, flags, dstY); |
3344 | 2780 } |
2781 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB | |
2782 { | |
2783 int lumAlpha= vLumFilter[2*dstY+1]; | |
2784 int chrAlpha= vChrFilter[2*dstY+1]; | |
7723 | 2785 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), |
6578 | 2786 dest, dstW, lumAlpha, chrAlpha, dstY); |
3344 | 2787 } |
2788 else //General RGB | |
2789 { | |
7723 | 2790 RENAME(yuv2packedX)(c, |
3344 | 2791 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, |
2792 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
9413 | 2793 dest, dstW, dstY); |
3344 | 2794 } |
2795 } | |
3352 | 2796 } |
11000 | 2797 else // hmm looks like we can't use MMX here without overwriting this array's tail |
3352 | 2798 { |
2799 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
2800 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
6615 | 2801 if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 |
3352 | 2802 { |
7351 | 2803 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; |
2804 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
6540 | 2805 yuv2yuvXinC( |
6532
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2806 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, |
9834d9980c45
yvu9 support (other planar yuv formats with other chroma subsamplings should be trivial to add, if they had a IMGFMT)
michael
parents:
6520
diff
changeset
|
2807 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, |
6540 | 2808 dest, uDest, vDest, dstW, chrDstW); |
3352 | 2809 } |
2810 else | |
2811 { | |
2812 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
2813 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
7723 | 2814 yuv2packedXinC(c, |
3352 | 2815 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, |
2816 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
6578 | 2817 dest, dstW, dstY); |
3352 | 2818 } |
2819 } | |
3344 | 2820 } |
2534
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2821 |
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2822 #ifdef HAVE_MMX |
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2823 __asm __volatile(SFENCE:::"memory"); |
2566 | 2824 __asm __volatile(EMMS:::"memory"); |
2534
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2825 #endif |
4276
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2826 /* store changed local vars back in the context */ |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2827 c->dstY= dstY; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2828 c->lumBufIndex= lumBufIndex; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2829 c->chrBufIndex= chrBufIndex; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2830 c->lastInLumBuf= lastInLumBuf; |
9199d15cb4e0
removed global vars so that multiple swscalers can be used
michael
parents:
4248
diff
changeset
|
2831 c->lastInChrBuf= lastInChrBuf; |
9494
543ab3909b78
sws_ prefix, more seperation between internal & external swscaler API
michael
parents:
9476
diff
changeset
|
2832 |
543ab3909b78
sws_ prefix, more seperation between internal & external swscaler API
michael
parents:
9476
diff
changeset
|
2833 return dstY - lastDstY; |
3641 | 2834 } |