mplayer.hg: annotate libswscale/swscale_template.c @ 19619:a83e5b8d2e63
Patch from Karolina Lindqvist <karolina.lindqvist@kramnet.se>:
"There is a bug in the zoran -vo zr driver that always garbles the output.
It also probably affects the zrmjpeg filter. This patch takes care of the
problem."
Patch tested and OK. And 10l to me, because this bug has probably existed for
a looong time.
author    rik
date      Fri, 01 Sep 2006 18:49:40 +0000
parents   4678e9f81334
children  8e50cba9fe03
/*
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

    the C code (not assembly, mmx, ...) of the swscaler which has been written
    by Michael Niedermayer can be used under the LGPL license too
*/

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
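/* Note: MOVNTQ wraps REAL_MOVNTQ so that macro arguments such as %%REGa are
   expanded first and only then stringized by #a/#b inside REAL_MOVNTQ;
   stringizing directly would paste the unexpanded tokens into the asm text. */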

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

#define YSCALEYUV2YV12X(x, offset, dest, width) \
asm volatile(\
    "xor %%"REG_a", %%"REG_a"        \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
    "movq %%mm3, %%mm4               \n\t"\
    "lea " offset "(%0), %%"REG_d"   \n\t"\
    "mov (%%"REG_d"), %%"REG_S"      \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                              \n\t"\
    "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
    "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
    "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
    "add $16, %%"REG_d"              \n\t"\
    "mov (%%"REG_d"), %%"REG_S"      \n\t"\
    "test %%"REG_S", %%"REG_S"       \n\t"\
    "pmulhw %%mm0, %%mm2             \n\t"\
    "pmulhw %%mm0, %%mm5             \n\t"\
    "paddw %%mm2, %%mm3              \n\t"\
    "paddw %%mm5, %%mm4              \n\t"\
    " jnz 1b                         \n\t"\
    "psraw $3, %%mm3                 \n\t"\
    "psraw $3, %%mm4                 \n\t"\
    "packuswb %%mm4, %%mm3           \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a"               \n\t"\
    "cmp %2, %%"REG_a"               \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
    "movq %%mm3, %%mm4               \n\t"\
    "lea " offset "(%0), %%"REG_d"   \n\t"\
    "mov (%%"REG_d"), %%"REG_S"      \n\t"\
    "jb 1b                           \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "p" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
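#if 0
/* Plain-C sketch of what YSCALEYUV2YV12X computes, for reference only
   (assumption: this is not the project's actual C fallback). The filter is
   a NULL-terminated list of {srcPtr, coeff} pairs walked by the test/jnz
   above; pmulhw yields (a*b)>>16 and packuswb clamps to 0..255. */
static inline void yscaleyuv2yv12x_ref(int16_t **src, const int16_t *coeff,
                                       int n, int rounder, /* VROUNDER word */
                                       uint8_t *dest, long width)
{
    for (long i = 0; i < width; i++) {
        int val = rounder;                         /* movq VROUNDER, %%mm3 */
        for (int j = 0; j < n; j++)
            val += (src[j][i] * coeff[j]) >> 16;   /* pmulhw + paddw */
        val >>= 3;                                 /* psraw $3 */
        dest[i] = val < 0 ? 0 : (val > 255 ? 255 : val); /* packuswb */
    }
}
#endif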

#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
asm volatile(\
    "lea " offset "(%0), %%"REG_d"   \n\t"\
    "xor %%"REG_a", %%"REG_a"        \n\t"\
    "pxor %%mm4, %%mm4               \n\t"\
    "pxor %%mm5, %%mm5               \n\t"\
    "pxor %%mm6, %%mm6               \n\t"\
    "pxor %%mm7, %%mm7               \n\t"\
    "mov (%%"REG_d"), %%"REG_S"      \n\t"\
    ASMALIGN(4) \
    "1:                              \n\t"\
    "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0\n\t" /* srcData */\
    "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
    "mov 4(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1\n\t" /* srcData */\
    "movq %%mm0, %%mm3               \n\t"\
    "punpcklwd %%mm1, %%mm0          \n\t"\
    "punpckhwd %%mm1, %%mm3          \n\t"\
    "movq 8(%%"REG_d"), %%mm1        \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0            \n\t"\
    "pmaddwd %%mm1, %%mm3            \n\t"\
    "paddd %%mm0, %%mm4              \n\t"\
    "paddd %%mm3, %%mm5              \n\t"\
    "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3\n\t" /* srcData */\
    "mov 16(%%"REG_d"), %%"REG_S"    \n\t"\
    "add $16, %%"REG_d"              \n\t"\
    "test %%"REG_S", %%"REG_S"       \n\t"\
    "movq %%mm2, %%mm0               \n\t"\
    "punpcklwd %%mm3, %%mm2          \n\t"\
    "punpckhwd %%mm3, %%mm0          \n\t"\
    "pmaddwd %%mm1, %%mm2            \n\t"\
    "pmaddwd %%mm1, %%mm0            \n\t"\
    "paddd %%mm2, %%mm6              \n\t"\
    "paddd %%mm0, %%mm7              \n\t"\
    " jnz 1b                         \n\t"\
    "psrad $16, %%mm4                \n\t"\
    "psrad $16, %%mm5                \n\t"\
    "psrad $16, %%mm6                \n\t"\
    "psrad $16, %%mm7                \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
    "packssdw %%mm5, %%mm4           \n\t"\
    "packssdw %%mm7, %%mm6           \n\t"\
    "paddw %%mm0, %%mm4              \n\t"\
    "paddw %%mm0, %%mm6              \n\t"\
    "psraw $3, %%mm4                 \n\t"\
    "psraw $3, %%mm6                 \n\t"\
    "packuswb %%mm6, %%mm4           \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a"               \n\t"\
    "cmp %2, %%"REG_a"               \n\t"\
    "lea " offset "(%0), %%"REG_d"   \n\t"\
    "pxor %%mm4, %%mm4               \n\t"\
    "pxor %%mm5, %%mm5               \n\t"\
    "pxor %%mm6, %%mm6               \n\t"\
    "pxor %%mm7, %%mm7               \n\t"\
    "mov (%%"REG_d"), %%"REG_S"      \n\t"\
    "jb 1b                           \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "p" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
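/* Note: unlike YSCALEYUV2YV12X above, the _ACCURATE variant widens to 32 bit
   with punpck[lh]wd + pmaddwd and accumulates with paddd before rounding,
   which is presumably what fixes the +-1 errors mentioned in r19172. */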

#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"               \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                              \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0  \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0                 \n\t"\
    "psraw $7, %%mm1                 \n\t"\
    "packuswb %%mm1, %%mm0           \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a"               \n\t"\
    "jnc 1b                          \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2PACKEDX \
asm volatile(\
    "xor %%"REG_a", %%"REG_a"        \n\t"\
    ASMALIGN(4)\
    "nop                             \n\t"\
    "1:                              \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
    "mov (%%"REG_d"), %%"REG_S"      \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
    "movq %%mm3, %%mm4               \n\t"\
    ASMALIGN(4)\
    "2:                              \n\t"\
    "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2     \n\t" /* UsrcData */\
    "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d"              \n\t"\
    "mov (%%"REG_d"), %%"REG_S"      \n\t"\
    "pmulhw %%mm0, %%mm2             \n\t"\
    "pmulhw %%mm0, %%mm5             \n\t"\
    "paddw %%mm2, %%mm3              \n\t"\
    "paddw %%mm5, %%mm4              \n\t"\
    "test %%"REG_S", %%"REG_S"       \n\t"\
    " jnz 2b                         \n\t"\
    \
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
    "mov (%%"REG_d"), %%"REG_S"      \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
    "movq %%mm1, %%mm7               \n\t"\
    ASMALIGN(4)\
    "2:                              \n\t"\
    "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm2  \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
    "add $16, %%"REG_d"              \n\t"\
    "mov (%%"REG_d"), %%"REG_S"      \n\t"\
    "pmulhw %%mm0, %%mm2             \n\t"\
    "pmulhw %%mm0, %%mm5             \n\t"\
    "paddw %%mm2, %%mm1              \n\t"\
    "paddw %%mm5, %%mm7              \n\t"\
    "test %%"REG_S", %%"REG_S"       \n\t"\
    " jnz 2b                         \n\t"\

#define YSCALEYUV2PACKEDX_END\
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

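/* Usage sketch (an assumption about callers outside this excerpt): the head
   and tail are split so a convert + store sequence can be pasted between
   them, e.g.
       YSCALEYUV2PACKEDX
       YSCALEYUV2RGBX
       WRITEBGR32(%4, %5, %%REGa)
       YSCALEYUV2PACKEDX_END
   where the %4/%5 operand numbers depend on the enclosing function. */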
#define YSCALEYUV2PACKEDX_ACCURATE \
asm volatile(\
    "xor %%"REG_a", %%"REG_a"        \n\t"\
    ASMALIGN(4)\
    "nop                             \n\t"\
    "1:                              \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
    "mov (%%"REG_d"), %%"REG_S"      \n\t"\
    "pxor %%mm4, %%mm4               \n\t"\
    "pxor %%mm5, %%mm5               \n\t"\
    "pxor %%mm6, %%mm6               \n\t"\
    "pxor %%mm7, %%mm7               \n\t"\
    ASMALIGN(4)\
    "2:                              \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0     \n\t" /* UsrcData */\
    "movq 4096(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov 4(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1     \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3               \n\t"\
    "punpcklwd %%mm1, %%mm0          \n\t"\
    "punpckhwd %%mm1, %%mm3          \n\t"\
    "movq 8(%%"REG_d"), %%mm1        \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0            \n\t"\
    "pmaddwd %%mm1, %%mm3            \n\t"\
    "paddd %%mm0, %%mm4              \n\t"\
    "paddd %%mm3, %%mm5              \n\t"\
    "movq 4096(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov 16(%%"REG_d"), %%"REG_S"    \n\t"\
    "add $16, %%"REG_d"              \n\t"\
    "test %%"REG_S", %%"REG_S"       \n\t"\
    "movq %%mm2, %%mm0               \n\t"\
    "punpcklwd %%mm3, %%mm2          \n\t"\
    "punpckhwd %%mm3, %%mm0          \n\t"\
    "pmaddwd %%mm1, %%mm2            \n\t"\
    "pmaddwd %%mm1, %%mm0            \n\t"\
    "paddd %%mm2, %%mm6              \n\t"\
    "paddd %%mm0, %%mm7              \n\t"\
    " jnz 2b                         \n\t"\
    "psrad $16, %%mm4                \n\t"\
    "psrad $16, %%mm5                \n\t"\
    "psrad $16, %%mm6                \n\t"\
    "psrad $16, %%mm7                \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
    "packssdw %%mm5, %%mm4           \n\t"\
    "packssdw %%mm7, %%mm6           \n\t"\
    "paddw %%mm0, %%mm4              \n\t"\
    "paddw %%mm0, %%mm6              \n\t"\
    "movq %%mm4, "U_TEMP"(%0)        \n\t"\
    "movq %%mm6, "V_TEMP"(%0)        \n\t"\
    \
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
    "mov (%%"REG_d"), %%"REG_S"      \n\t"\
    "pxor %%mm1, %%mm1               \n\t"\
    "pxor %%mm5, %%mm5               \n\t"\
    "pxor %%mm7, %%mm7               \n\t"\
    "pxor %%mm6, %%mm6               \n\t"\
    ASMALIGN(4)\
    "2:                              \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0  \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov 4(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4  \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3               \n\t"\
    "punpcklwd %%mm4, %%mm0          \n\t"\
    "punpckhwd %%mm4, %%mm3          \n\t"\
    "movq 8(%%"REG_d"), %%mm4        \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0            \n\t"\
    "pmaddwd %%mm4, %%mm3            \n\t"\
    "paddd %%mm0, %%mm1              \n\t"\
    "paddd %%mm3, %%mm5              \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov 16(%%"REG_d"), %%"REG_S"    \n\t"\
    "add $16, %%"REG_d"              \n\t"\
    "test %%"REG_S", %%"REG_S"       \n\t"\
    "movq %%mm2, %%mm0               \n\t"\
    "punpcklwd %%mm3, %%mm2          \n\t"\
    "punpckhwd %%mm3, %%mm0          \n\t"\
    "pmaddwd %%mm4, %%mm2            \n\t"\
    "pmaddwd %%mm4, %%mm0            \n\t"\
    "paddd %%mm2, %%mm7              \n\t"\
    "paddd %%mm0, %%mm6              \n\t"\
    " jnz 2b                         \n\t"\
    "psrad $16, %%mm1                \n\t"\
    "psrad $16, %%mm5                \n\t"\
    "psrad $16, %%mm7                \n\t"\
    "psrad $16, %%mm6                \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
    "packssdw %%mm5, %%mm1           \n\t"\
    "packssdw %%mm6, %%mm7           \n\t"\
    "paddw %%mm0, %%mm1              \n\t"\
    "paddw %%mm0, %%mm7              \n\t"\
    "movq "U_TEMP"(%0), %%mm3        \n\t"\
    "movq "V_TEMP"(%0), %%mm4        \n\t"\

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3     \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4     \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2               \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5               \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3    \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4    \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2    \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5    \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1     \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4              \n\t"\
    "movq %%mm2, %%mm0               \n\t"\
    "movq %%mm5, %%mm6               \n\t"\
    "movq %%mm4, %%mm3               \n\t"\
    "punpcklwd %%mm2, %%mm2          \n\t"\
    "punpcklwd %%mm5, %%mm5          \n\t"\
    "punpcklwd %%mm4, %%mm4          \n\t"\
    "paddw %%mm1, %%mm2              \n\t"\
    "paddw %%mm1, %%mm5              \n\t"\
    "paddw %%mm1, %%mm4              \n\t"\
    "punpckhwd %%mm0, %%mm0          \n\t"\
    "punpckhwd %%mm6, %%mm6          \n\t"\
    "punpckhwd %%mm3, %%mm3          \n\t"\
    "paddw %%mm7, %%mm0              \n\t"\
    "paddw %%mm7, %%mm6              \n\t"\
    "paddw %%mm7, %%mm3              \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2           \n\t"\
    "packuswb %%mm6, %%mm5           \n\t"\
    "packuswb %%mm3, %%mm4           \n\t"\
    "pxor %%mm7, %%mm7               \n\t"
#if 0
#define FULL_YSCALEYUV2RGB \
    "pxor %%mm7, %%mm7               \n\t"\
    "movd %6, %%mm6                  \n\t" /*yalpha1*/\
    "punpcklwd %%mm6, %%mm6          \n\t"\
    "punpcklwd %%mm6, %%mm6          \n\t"\
    "movd %7, %%mm5                  \n\t" /*uvalpha1*/\
    "punpcklwd %%mm5, %%mm5          \n\t"\
    "punpcklwd %%mm5, %%mm5          \n\t"\
    "xor %%"REG_a", %%"REG_a"        \n\t"\
    ASMALIGN(4)\
    "1:                              \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0  \n\t" /*buf0[eax]*/\
    "movq (%1, %%"REG_a", 2), %%mm1  \n\t" /*buf1[eax]*/\
    "movq (%2, %%"REG_a",2), %%mm2   \n\t" /* uvbuf0[eax]*/\
    "movq (%3, %%"REG_a",2), %%mm3   \n\t" /* uvbuf1[eax]*/\
    "psubw %%mm1, %%mm0              \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm3, %%mm2              \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw %%mm6, %%mm0             \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw %%mm5, %%mm2             \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw $4, %%mm1                 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3                 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "paddw %%mm0, %%mm1              \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3              \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "psubw %%mm0, %%mm4              \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw "MANGLE(w80)", %%mm1      \n\t" /* 8(Y-16)*/\
    "psubw "MANGLE(w400)", %%mm3     \n\t" /* 8(U-128)*/\
    "pmulhw "MANGLE(yCoeff)", %%mm1  \n\t"\
    \
    \
    "pmulhw %%mm5, %%mm4             \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq %%mm3, %%mm2               \n\t" /* (U-128)8*/\
    "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
    "psraw $4, %%mm0                 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
    "paddw %%mm4, %%mm0              \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "MANGLE(w400)", %%mm0     \n\t" /* (V-128)8*/\
    \
    \
    "movq %%mm0, %%mm4               \n\t" /* (V-128)8*/\
    "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
    "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
    "paddw %%mm1, %%mm3              \n\t" /* B*/\
    "paddw %%mm1, %%mm0              \n\t" /* R*/\
    "packuswb %%mm3, %%mm3           \n\t"\
    \
    "packuswb %%mm0, %%mm0           \n\t"\
    "paddw %%mm4, %%mm2              \n\t"\
    "paddw %%mm2, %%mm1              \n\t" /* G*/\
    \
    "packuswb %%mm1, %%mm1           \n\t"
#endif

#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
    "psraw $3, %%mm0                 \n\t"\
    "psraw $3, %%mm1                 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
    "xor "#index", "#index"          \n\t"\
    ASMALIGN(4)\
    "1:                              \n\t"\
    "movq (%2, "#index"), %%mm2      \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3      \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, "#index"), %%mm5  \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, "#index"), %%mm4  \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2              \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5              \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
    "pmulhw %%mm0, %%mm2             \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5             \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3                 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $7, %%mm4                 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3              \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4              \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0   \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1   \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6  \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7  \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0              \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6              \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1                 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $7, %%mm7                 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1              \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7              \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

#define REAL_YSCALEYUV2RGB(index, c) \
    "xor "#index", "#index"          \n\t"\
    ASMALIGN(4)\
    "1:                              \n\t"\
    "movq (%2, "#index"), %%mm2      \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3      \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, "#index"), %%mm5  \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, "#index"), %%mm4  \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2              \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5              \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
    "pmulhw %%mm0, %%mm2             \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5             \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3                 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4                 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3              \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4              \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3   \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4   \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2               \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5               \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3  \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4  \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm0   \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1   \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6  \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7  \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0              \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6              \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1                 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7                 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1              \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7              \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2  \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5  \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1   \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7   \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1   \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7   \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4              \n\t"\
    "movq %%mm2, %%mm0               \n\t"\
    "movq %%mm5, %%mm6               \n\t"\
    "movq %%mm4, %%mm3               \n\t"\
    "punpcklwd %%mm2, %%mm2          \n\t"\
    "punpcklwd %%mm5, %%mm5          \n\t"\
    "punpcklwd %%mm4, %%mm4          \n\t"\
    "paddw %%mm1, %%mm2              \n\t"\
    "paddw %%mm1, %%mm5              \n\t"\
    "paddw %%mm1, %%mm4              \n\t"\
    "punpckhwd %%mm0, %%mm0          \n\t"\
    "punpckhwd %%mm6, %%mm6          \n\t"\
    "punpckhwd %%mm3, %%mm3          \n\t"\
    "paddw %%mm7, %%mm0              \n\t"\
    "paddw %%mm7, %%mm6              \n\t"\
    "paddw %%mm7, %%mm3              \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2           \n\t"\
    "packuswb %%mm6, %%mm5           \n\t"\
    "packuswb %%mm3, %%mm4           \n\t"\
    "pxor %%mm7, %%mm7               \n\t"
#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)

#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index"          \n\t"\
    ASMALIGN(4)\
    "1:                              \n\t"\
    "movq (%2, "#index"), %%mm3      \n\t" /* uvbuf0[eax]*/\
    "movq 4096(%2, "#index"), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3                 \n\t" \
    "psraw $7, %%mm4                 \n\t" \
    "movq (%0, "#index", 2), %%mm1   \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7  \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1                 \n\t" \
    "psraw $7, %%mm7                 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index"          \n\t"\
    ASMALIGN(4)\
    "1:                              \n\t"\
    "movq (%2, "#index"), %%mm3      \n\t" /* uvbuf0[eax]*/\
    "movq 4096(%2, "#index"), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3                 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4                 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3   \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4   \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2               \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5               \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3  \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4  \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1   \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7  \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1                 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7                 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2  \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5  \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1   \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7   \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1   \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7   \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4              \n\t"\
    "movq %%mm2, %%mm0               \n\t"\
    "movq %%mm5, %%mm6               \n\t"\
    "movq %%mm4, %%mm3               \n\t"\
    "punpcklwd %%mm2, %%mm2          \n\t"\
    "punpcklwd %%mm5, %%mm5          \n\t"\
    "punpcklwd %%mm4, %%mm4          \n\t"\
    "paddw %%mm1, %%mm2              \n\t"\
    "paddw %%mm1, %%mm5              \n\t"\
    "paddw %%mm1, %%mm4              \n\t"\
    "punpckhwd %%mm0, %%mm0          \n\t"\
    "punpckhwd %%mm6, %%mm6          \n\t"\
    "punpckhwd %%mm3, %%mm3          \n\t"\
    "paddw %%mm7, %%mm0              \n\t"\
    "paddw %%mm7, %%mm6              \n\t"\
    "paddw %%mm7, %%mm3              \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2           \n\t"\
    "packuswb %%mm6, %%mm5           \n\t"\
    "packuswb %%mm3, %%mm4           \n\t"\
    "pxor %%mm7, %%mm7               \n\t"
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

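/* Note (inferred from the register usage): the *1/*1b variants read a single
   set of input lines instead of blending two; *1 uses uvbuf0 as-is, while
   *1b below averages uvbuf0 and uvbuf1 for the chroma. */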
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"          \n\t"\
    ASMALIGN(4)\
    "1:                              \n\t"\
    "movq (%2, "#index"), %%mm2      \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3      \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, "#index"), %%mm5  \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, "#index"), %%mm4  \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3              \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4              \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3                 \n\t" \
    "psrlw $8, %%mm4                 \n\t" \
    "movq (%0, "#index", 2), %%mm1   \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7  \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1                 \n\t" \
    "psraw $7, %%mm7                 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index"          \n\t"\
    ASMALIGN(4)\
    "1:                              \n\t"\
    "movq (%2, "#index"), %%mm2      \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3      \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, "#index"), %%mm5  \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, "#index"), %%mm4  \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3              \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4              \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3                 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4                 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3   \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4   \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2               \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5               \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3  \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4  \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1   \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7  \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1                 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7                 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2  \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5  \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1   \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7   \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1   \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7   \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4              \n\t"\
    "movq %%mm2, %%mm0               \n\t"\
    "movq %%mm5, %%mm6               \n\t"\
    "movq %%mm4, %%mm3               \n\t"\
    "punpcklwd %%mm2, %%mm2          \n\t"\
    "punpcklwd %%mm5, %%mm5          \n\t"\
    "punpcklwd %%mm4, %%mm4          \n\t"\
    "paddw %%mm1, %%mm2              \n\t"\
    "paddw %%mm1, %%mm5              \n\t"\
    "paddw %%mm1, %%mm4              \n\t"\
    "punpckhwd %%mm0, %%mm0          \n\t"\
    "punpckhwd %%mm6, %%mm6          \n\t"\
    "punpckhwd %%mm3, %%mm3          \n\t"\
    "paddw %%mm7, %%mm0              \n\t"\
    "paddw %%mm7, %%mm6              \n\t"\
    "paddw %%mm7, %%mm3              \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2           \n\t"\
    "packuswb %%mm6, %%mm5           \n\t"\
    "packuswb %%mm3, %%mm4           \n\t"\
    "pxor %%mm7, %%mm7               \n\t"
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1               \n\t" /* B */\
    "movq %%mm5, %%mm6               \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2          \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5          \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1          \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6          \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0               \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3               \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0          \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2          \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1          \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3          \n\t" /* 0RGB0RGB 3 */\
    \
    MOVNTQ(%%mm0, (dst, index, 4))\
    MOVNTQ(%%mm2, 8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
    \
    "add $8, "#index"                \n\t"\
    "cmp "#dstw", "#index"           \n\t"\
    " jb 1b                          \n\t"
#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
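/* Little-endian sketch of the resulting pixels (assumption, reference only):
       ((uint32_t *)dst)[i] = B | (G << 8) | ((uint32_t)R << 16);
   i.e. the "0RGB" dwords built by the punpck sequence above. */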

#define REAL_WRITEBGR16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2       \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4       \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5       \n\t" /* R */\
    "psrlq $3, %%mm2                 \n\t"\
    \
    "movq %%mm2, %%mm1               \n\t"\
    "movq %%mm4, %%mm3               \n\t"\
    \
    "punpcklbw %%mm7, %%mm3          \n\t"\
    "punpcklbw %%mm5, %%mm2          \n\t"\
    "punpckhbw %%mm7, %%mm4          \n\t"\
    "punpckhbw %%mm5, %%mm1          \n\t"\
    \
    "psllq $3, %%mm3                 \n\t"\
    "psllq $3, %%mm4                 \n\t"\
    \
    "por %%mm3, %%mm2                \n\t"\
    "por %%mm4, %%mm1                \n\t"\
    \
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
    \
    "add $8, "#index"                \n\t"\
    "cmp "#dstw", "#index"           \n\t"\
    " jb 1b                          \n\t"
#define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
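/* The masks and shifts above amount to RGB565 packing (sketch, inferred
   from the bF8/bFC masks):
       pixel = (B >> 3) | ((G >> 2) << 5) | ((R >> 3) << 11);
   WRITEBGR15 below is the same idea with 5 bits per channel. */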

#define REAL_WRITEBGR15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2       \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4       \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5       \n\t" /* R */\
    "psrlq $3, %%mm2                 \n\t"\
    "psrlq $1, %%mm5                 \n\t"\
    \
    "movq %%mm2, %%mm1               \n\t"\
    "movq %%mm4, %%mm3               \n\t"\
    \
    "punpcklbw %%mm7, %%mm3          \n\t"\
    "punpcklbw %%mm5, %%mm2          \n\t"\
    "punpckhbw %%mm7, %%mm4          \n\t"\
    "punpckhbw %%mm5, %%mm1          \n\t"\
    \
    "psllq $2, %%mm3                 \n\t"\
    "psllq $2, %%mm4                 \n\t"\
    \
    "por %%mm3, %%mm2                \n\t"\
    "por %%mm4, %%mm1                \n\t"\
    \
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
    \
    "add $8, "#index"                \n\t"\
    "cmp "#dstw", "#index"           \n\t"\
    " jb 1b                          \n\t"
#define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)

#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1               \n\t" /* B */\
    "movq %%mm5, %%mm6               \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2          \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5          \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1          \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6          \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0               \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3               \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0          \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2          \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1          \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3          \n\t" /* 0RGB0RGB 3 */\
    \
    "movq %%mm0, %%mm4               \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0                 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0                \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4               \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2                \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
    \
    "movq %%mm4, %%mm2               \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4                \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2                \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2                \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4               \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1                 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1                \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4               \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1                \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2                \n\t" /* BRGBRGBR 1 */\
    \
    "psrlq $32, %%mm4                \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5               \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3                 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3                \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3                \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3                \n\t" /* RGBRGBRG 2.5 */\
    \
    MOVNTQ(%%mm0, (dst))\
    MOVNTQ(%%mm2, 8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst"                 \n\t"\
    \
    "add $8, "#index"                \n\t"\
    "cmp "#dstw", "#index"           \n\t"\
    " jb 1b                          \n\t"
800 | |
801 #define WRITEBGR24MMX(dst, dstw, index) \ | |
802 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ | |
803 "movq %%mm2, %%mm1 \n\t" /* B */\ | |
804 "movq %%mm5, %%mm6 \n\t" /* R */\ | |
805 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
806 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
807 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
808 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
809 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
810 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
811 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
812 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
813 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
814 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
815 \ | |
816 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ | |
817 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ | |
818 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ | |
819 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ | |
820 \ | |
821 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ | |
822 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ | |
823 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ | |
824 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ | |
825 \ | |
826 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ | |
827 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ | |
828 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ | |
829 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ | |
830 \ | |
831 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ | |
832 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ | |
833 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ | |
834 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ | |
835 MOVNTQ(%%mm0, (dst))\ | |
836 \ | |
837 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ | |
838 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ | |
839 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ | |
840 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ | |
841 MOVNTQ(%%mm6, 8(dst))\ | |
842 \ | |
843 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ | |
844 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ | |
845 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ | |
846 MOVNTQ(%%mm5, 16(dst))\ | |
847 \ | |
848 "add $24, "#dst" \n\t"\ | |
849 \ | |
850 "add $8, "#index" \n\t"\ | |
851 "cmp "#dstw", "#index" \n\t"\ | |
852 " jb 1b \n\t" | |
853 | |
854 #define WRITEBGR24MMX2(dst, dstw, index) \ | |
855 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ | |
856 "movq "MANGLE(M24A)", %%mm0 \n\t"\ | |
857 "movq "MANGLE(M24C)", %%mm7 \n\t"\ | |
858 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ | |
859 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ | |
860 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ | |
861 \ | |
862 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ | |
863 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ | |
864 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ | |
865 \ | |
866 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ | |
867 "por %%mm1, %%mm6 \n\t"\ | |
868 "por %%mm3, %%mm6 \n\t"\ | |
869 MOVNTQ(%%mm6, (dst))\ | |
870 \ | |
871 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ | |
872 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ | |
873 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ | |
874 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ | |
875 \ | |
876 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ | |
877 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ | |
878 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ | |
879 \ | |
880 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ | |
881 "por %%mm3, %%mm6 \n\t"\ | |
882 MOVNTQ(%%mm6, 8(dst))\ | |
883 \ | |
884 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\ | |
885 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ | |
886 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\ | |
887 \ | |
888 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ | |
889 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ | |
890 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ | |
891 \ | |
892 "por %%mm1, %%mm3 \n\t"\ | |
893 "por %%mm3, %%mm6 \n\t"\ | |
894 MOVNTQ(%%mm6, 16(dst))\ | |
895 \ | |
896 "add $24, "#dst" \n\t"\ | |
897 \ | |
898 "add $8, "#index" \n\t"\ | |
899 "cmp "#dstw", "#index" \n\t"\ | |
900 " jb 1b \n\t" | |
901 | |
902 #ifdef HAVE_MMX2 | |
903 #undef WRITEBGR24 | |
904 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index) | |
905 #else | |
906 #undef WRITEBGR24 | |
907 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) | |
908 #endif | |
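/* 24bpp is the awkward case for 64-bit registers: 8 pixels span 24 bytes,
   exactly three MOVNTQ stores per iteration. The MMX2 variant builds each
   output quadword directly with pshufw and the M24A/M24B/M24C masks; the
   plain MMX variant reaches the same byte order via shifts and ors. The
   layout produced is simply (scalar sketch):

       dst[3*i+0] = b; dst[3*i+1] = g; dst[3*i+2] = r;
*/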
909 | |
910 #define REAL_WRITEYUY2(dst, dstw, index) \ | |
911 "packuswb %%mm3, %%mm3 \n\t"\ | |
912 "packuswb %%mm4, %%mm4 \n\t"\ | |
913 "packuswb %%mm7, %%mm1 \n\t"\ | |
914 "punpcklbw %%mm4, %%mm3 \n\t"\ | |
915 "movq %%mm1, %%mm7 \n\t"\ | |
916 "punpcklbw %%mm3, %%mm1 \n\t"\ | |
917 "punpckhbw %%mm3, %%mm7 \n\t"\ | |
918 \ | |
919 MOVNTQ(%%mm1, (dst, index, 2))\ | |
920 MOVNTQ(%%mm7, 8(dst, index, 2))\ | |
921 \ | |
922 "add $8, "#index" \n\t"\ | |
923 "cmp "#dstw", "#index" \n\t"\ | |
924 " jb 1b \n\t" | |
925 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) | |
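/* WRITEYUY2 emits packed 4:2:2: each pixel pair becomes the four bytes
   Y0 U Y1 V. Scalar sketch of the punpcklbw cascade above:

       dst[4*i+0] = y0; dst[4*i+1] = u; dst[4*i+2] = y1; dst[4*i+3] = v;
*/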
926 | |
927 | |
928 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | |
929 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
930 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) | |
931 { | |
932 #ifdef HAVE_MMX | |
933 if(c->flags & SWS_ACCURATE_RND){ |
934 if(uDest){ |
935 YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW) |
936 YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW) |
937 } |
938 |
939 YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW) |
940 }else{ |
941 if(uDest){ |
942 YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW) |
943 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW) |
944 } |
945 |
946 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW) |
947 } |
18861 | 948 #else |
949 #ifdef HAVE_ALTIVEC | |
950 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize, | |
951 chrFilter, chrSrc, chrFilterSize, | |
952 dest, uDest, vDest, dstW, chrDstW); | |
953 #else //HAVE_ALTIVEC | |
954 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, | |
955 chrFilter, chrSrc, chrFilterSize, | |
956 dest, uDest, vDest, dstW, chrDstW); | |
957 #endif //!HAVE_ALTIVEC | |
958 #endif | |
959 } | |
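/* Per output pixel the function above computes a small vertical FIR over
   lumFilterSize (resp. chrFilterSize) source lines. A scalar sketch of
   the luma case, with rounding and shift as in the yuv2yuvXinC fallback:

       int val = 1 << 18;                          // rounding constant
       for(j=0; j<lumFilterSize; j++)
           val += lumSrc[j][i] * lumFilter[j];     // 16x16 -> 32 bit MAC
       val >>= 19;
       dest[i] = val < 0 ? 0 : (val > 255 ? 255 : val);
*/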
960 | |
961 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | |
962 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
963 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat) | |
964 { | |
965 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize, | |
966 chrFilter, chrSrc, chrFilterSize, | |
967 dest, uDest, dstW, chrDstW, dstFormat); | |
968 } | |
969 | |
970 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc, | |
971 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) | |
972 { | |
973 #ifdef HAVE_MMX | |
974 if(uDest != NULL) | |
975 { | |
976 asm volatile( | |
977 YSCALEYUV2YV121 | |
978 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW), | |
979 "g" (-chrDstW) | |
980 : "%"REG_a | |
981 ); | |
982 | |
983 asm volatile( | |
984 YSCALEYUV2YV121 | |
985 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW), | |
986 "g" (-chrDstW) | |
987 : "%"REG_a | |
988 ); | |
989 } | |
990 | |
991 asm volatile( | |
992 YSCALEYUV2YV121 | |
993 :: "r" (lumSrc + dstW), "r" (dest + dstW), | |
994 "g" (-dstW) | |
995 : "%"REG_a | |
996 ); | |
997 #else | |
998 int i; | |
999 for(i=0; i<dstW; i++) | |
1000 { | |
1001 int val= lumSrc[i]>>7; | |
1002 | |
1003 if(val&256){ | |
1004 if(val<0) val=0; | |
1005 else val=255; | |
1006 } | |
1007 | |
1008 dest[i]= val; | |
1009 } | |
1010 | |
1011 if(uDest != NULL) | |
1012 for(i=0; i<chrDstW; i++) | |
1013 { | |
1014 int u=chrSrc[i]>>7; | |
1015 int v=chrSrc[i + 2048]>>7; | |
1016 | |
1017 if((u|v)&256){ | |
1018 if(u<0) u=0; | |
1019 else if (u>255) u=255; | |
1020 if(v<0) v=0; | |
1021 else if (v>255) v=255; | |
1022 } | |
1023 | |
1024 uDest[i]= u; | |
1025 vDest[i]= v; | |
1026 } | |
1027 #endif | |
1028 } | |
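/* Note on the val&256 test in the C path above: after >>7 an int16_t
   source yields values in [-256, 255], and bit 8 is set exactly when the
   result falls outside 0..255, so a single AND replaces both range
   compares in the common in-range case. */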
1029 | |
1030 | |
1031 /** | |
1032 * vertical scale YV12 to RGB | |
1033 */ | |
1034 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | |
1035 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
1036 uint8_t *dest, long dstW, long dstY) | |
1037 { | |
1038 long dummy=0; | |
1039 #ifdef HAVE_MMX |
1040 if(c->flags & SWS_ACCURATE_RND){ |
1041 switch(c->dstFormat){ |
1042 case IMGFMT_BGR32: |
19173 | 1043 YSCALEYUV2PACKEDX_ACCURATE |
1044 YSCALEYUV2RGBX | |
1045 WRITEBGR32(%4, %5, %%REGa) |
1046 |
19173 | 1047 YSCALEYUV2PACKEDX_END |
1048 return; | |
1049 case IMGFMT_BGR24: | |
1050 YSCALEYUV2PACKEDX_ACCURATE | |
1051 YSCALEYUV2RGBX | |
19396 | 1052 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize |
1053 "add %4, %%"REG_c" \n\t" | |
1054 WRITEBGR24(%%REGc, %5, %%REGa) | |
19173 | 1055 |
1056 | |
1057 :: "r" (&c->redDither), |
1058 "m" (dummy), "m" (dummy), "m" (dummy), |
1059 "r" (dest), "m" (dstW) |
1060 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S |
1061 ); |
1062 return; |
19173 | 1063 case IMGFMT_BGR15: |
1064 YSCALEYUV2PACKEDX_ACCURATE | |
1065 YSCALEYUV2RGBX | |
1066 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1067 #ifdef DITHER1XBPP | |
1068 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1069 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1070 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1071 #endif | |
1072 | |
1073 WRITEBGR15(%4, %5, %%REGa) | |
1074 YSCALEYUV2PACKEDX_END | |
1075 return; | |
1076 case IMGFMT_BGR16: | |
1077 YSCALEYUV2PACKEDX_ACCURATE | |
1078 YSCALEYUV2RGBX | |
1079 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1080 #ifdef DITHER1XBPP | |
1081 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1082 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1083 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1084 #endif | |
1085 | |
1086 WRITEBGR16(%4, %5, %%REGa) | |
1087 YSCALEYUV2PACKEDX_END | |
1088 return; | |
1089 case IMGFMT_YUY2: | |
1090 YSCALEYUV2PACKEDX_ACCURATE | |
1091 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1092 | |
1093 "psraw $3, %%mm3 \n\t" | |
1094 "psraw $3, %%mm4 \n\t" | |
1095 "psraw $3, %%mm1 \n\t" | |
1096 "psraw $3, %%mm7 \n\t" | |
1097 WRITEYUY2(%4, %5, %%REGa) | |
1098 YSCALEYUV2PACKEDX_END | |
1099 return; | |
1100 } | |
1101 }else{ | |
1102 switch(c->dstFormat) | |
1103 { | |
1104 case IMGFMT_BGR32: | |
1105 YSCALEYUV2PACKEDX | |
1106 YSCALEYUV2RGBX | |
1107 WRITEBGR32(%4, %5, %%REGa) | |
1108 YSCALEYUV2PACKEDX_END | |
1109 return; | |
1110 case IMGFMT_BGR24: | |
1111 YSCALEYUV2PACKEDX | |
1112 YSCALEYUV2RGBX | |
19396 | 1113 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize |
1114 "add %4, %%"REG_c" \n\t" | |
1115 WRITEBGR24(%%REGc, %5, %%REGa) | |
1116 |
1117 :: "r" (&c->redDither), |
1118 "m" (dummy), "m" (dummy), "m" (dummy), |
1119 "r" (dest), "m" (dstW) |
1120 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S |
1121 ); |
19173 | 1122 return; |
1123 case IMGFMT_BGR15: | |
1124 YSCALEYUV2PACKEDX | |
1125 YSCALEYUV2RGBX | |
1126 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1127 #ifdef DITHER1XBPP |
1128 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1129 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" |
1130 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" |
1131 #endif |
1132 |
1133 WRITEBGR15(%4, %5, %%REGa) |
19173 | 1134 YSCALEYUV2PACKEDX_END |
1135 return; | |
1136 case IMGFMT_BGR16: | |
1137 YSCALEYUV2PACKEDX | |
1138 YSCALEYUV2RGBX | |
1139 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1140 #ifdef DITHER1XBPP |
1141 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1142 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" |
1143 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" |
1144 #endif |
1145 |
1146 WRITEBGR16(%4, %5, %%REGa) |
1147 YSCALEYUV2PACKEDX_END |
1148 return; |
18861 | 1149 case IMGFMT_YUY2: |
1150 YSCALEYUV2PACKEDX | |
1151 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1152 | |
1153 "psraw $3, %%mm3 \n\t" | |
1154 "psraw $3, %%mm4 \n\t" | |
1155 "psraw $3, %%mm1 \n\t" | |
1156 "psraw $3, %%mm7 \n\t" | |
1157 WRITEYUY2(%4, %5, %%REGa) | |
19173 | 1158 YSCALEYUV2PACKEDX_END |
1159 return; |
1160 } |
1161 } |
18861 | 1162 #endif |
1163 #ifdef HAVE_ALTIVEC | |
1164 /* The following list of supported dstFormat values should | |
1165 match what's found in the body of altivec_yuv2packedX() */ | |
1166 if(c->dstFormat==IMGFMT_ABGR || c->dstFormat==IMGFMT_BGRA || | |
1167 c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 || | |
1168 c->dstFormat==IMGFMT_RGBA || c->dstFormat==IMGFMT_ARGB) | |
1169 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize, | |
1170 chrFilter, chrSrc, chrFilterSize, | |
1171 dest, dstW, dstY); | |
1172 else | |
1173 #endif | |
1174 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, | |
1175 chrFilter, chrSrc, chrFilterSize, | |
1176 dest, dstW, dstY); | |
1177 } | |
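/* In the asm blocks above, operand %4 is the destination pointer and %5
   is dstW. The BGR24 cases need the extra lea/add because 3 bytes per
   pixel matches no x86 addressing scale: "lea (a,a,2)" forms 3*index and
   the following add turns REG_c into dest + 3*index. */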
1178 | |
1179 /** | |
1180 * vertical bilinear scale YV12 to RGB | |
1181 */ | |
1182 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, | |
1183 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) | |
1184 { | |
1185 int yalpha1=yalpha^4095; | |
1186 int uvalpha1=uvalpha^4095; | |
1187 int i; | |
1188 | |
1189 #if 0 //isn't used | |
1190 if(flags&SWS_FULL_CHR_H_INT) | |
1191 { | |
1192 switch(dstFormat) | |
1193 { | |
1194 #ifdef HAVE_MMX | |
1195 case IMGFMT_BGR32: | |
1196 asm volatile( | |
1197 | |
1198 | |
1199 FULL_YSCALEYUV2RGB | |
1200 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
1201 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
1202 | |
1203 "movq %%mm3, %%mm1 \n\t" | |
1204 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
1205 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
1206 | |
1207 MOVNTQ(%%mm3, (%4, %%REGa, 4)) | |
1208 MOVNTQ(%%mm1, 8(%4, %%REGa, 4)) | |
1209 | |
1210 "add $4, %%"REG_a" \n\t" | |
1211 "cmp %5, %%"REG_a" \n\t" | |
1212 " jb 1b \n\t" | |
1213 | |
1214 | |
1215 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW), | |
1216 "m" (yalpha1), "m" (uvalpha1) | |
1217 : "%"REG_a | |
1218 ); | |
1219 break; | |
1220 case IMGFMT_BGR24: | |
1221 asm volatile( | |
1222 | |
1223 FULL_YSCALEYUV2RGB | |
1224 | |
1225 // lsb ... msb | |
1226 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
1227 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
1228 | |
1229 "movq %%mm3, %%mm1 \n\t" | |
1230 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
1231 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
1232 | |
1233 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0 | |
1234 "psrlq $8, %%mm3 \n\t" // GR0BGR00 | |
1235 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000 | |
1236 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00 | |
1237 "por %%mm2, %%mm3 \n\t" // BGRBGR00 | |
1238 "movq %%mm1, %%mm2 \n\t" | |
1239 "psllq $48, %%mm1 \n\t" // 000000BG | |
1240 "por %%mm1, %%mm3 \n\t" // BGRBGRBG | |
1241 | |
1242 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0 | |
1243 "psrld $16, %%mm2 \n\t" // R000R000 | |
1244 "psrlq $24, %%mm1 \n\t" // 0BGR0000 | |
1245 "por %%mm2, %%mm1 \n\t" // RBGRR000 | |
1246 | |
1247 "mov %4, %%"REG_b" \n\t" | |
1248 "add %%"REG_a", %%"REG_b" \n\t" | |
1249 | |
1250 #ifdef HAVE_MMX2 | |
1251 //FIXME Alignment | |
1252 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t" | |
1253 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t" | |
1254 #else | |
1255 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t" | |
1256 "psrlq $32, %%mm3 \n\t" | |
1257 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t" | |
1258 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t" | |
1259 #endif | |
1260 "add $4, %%"REG_a" \n\t" | |
1261 "cmp %5, %%"REG_a" \n\t" | |
1262 " jb 1b \n\t" | |
1263 | |
1264 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), | |
1265 "m" (yalpha1), "m" (uvalpha1) | |
1266 : "%"REG_a, "%"REG_b | |
1267 ); | |
1268 break; | |
1269 case IMGFMT_BGR15: | |
1270 asm volatile( | |
1271 | |
1272 FULL_YSCALEYUV2RGB | |
1273 #ifdef DITHER1XBPP | |
1274 "paddusb "MANGLE(g5Dither)", %%mm1\n\t" | |
1275 "paddusb "MANGLE(r5Dither)", %%mm0\n\t" | |
1276 "paddusb "MANGLE(b5Dither)", %%mm3\n\t" | |
1277 #endif | |
1278 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | |
1279 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
1280 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
1281 | |
1282 "psrlw $3, %%mm3 \n\t" | |
1283 "psllw $2, %%mm1 \n\t" | |
1284 "psllw $7, %%mm0 \n\t" | |
1285 "pand "MANGLE(g15Mask)", %%mm1 \n\t" | |
1286 "pand "MANGLE(r15Mask)", %%mm0 \n\t" | |
1287 | |
1288 "por %%mm3, %%mm1 \n\t" | |
1289 "por %%mm1, %%mm0 \n\t" | |
1290 | |
1291 MOVNTQ(%%mm0, (%4, %%REGa, 2)) | |
1292 | |
1293 "add $4, %%"REG_a" \n\t" | |
1294 "cmp %5, %%"REG_a" \n\t" | |
1295 " jb 1b \n\t" | |
1296 | |
1297 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), | |
1298 "m" (yalpha1), "m" (uvalpha1) | |
1299 : "%"REG_a | |
1300 ); | |
1301 break; | |
1302 case IMGFMT_BGR16: | |
1303 asm volatile( | |
1304 | |
1305 FULL_YSCALEYUV2RGB | |
1306 #ifdef DITHER1XBPP | |
1307 "paddusb "MANGLE(g6Dither)", %%mm1\n\t" | |
1308 "paddusb "MANGLE(r5Dither)", %%mm0\n\t" | |
1309 "paddusb "MANGLE(b5Dither)", %%mm3\n\t" | |
1310 #endif | |
1311 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | |
1312 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
1313 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
1314 | |
1315 "psrlw $3, %%mm3 \n\t" | |
1316 "psllw $3, %%mm1 \n\t" | |
1317 "psllw $8, %%mm0 \n\t" | |
1318 "pand "MANGLE(g16Mask)", %%mm1 \n\t" | |
1319 "pand "MANGLE(r16Mask)", %%mm0 \n\t" | |
1320 | |
1321 "por %%mm3, %%mm1 \n\t" | |
1322 "por %%mm1, %%mm0 \n\t" | |
1323 | |
1324 MOVNTQ(%%mm0, (%4, %%REGa, 2)) | |
1325 | |
1326 "add $4, %%"REG_a" \n\t" | |
1327 "cmp %5, %%"REG_a" \n\t" | |
1328 " jb 1b \n\t" | |
1329 | |
1330 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), | |
1331 "m" (yalpha1), "m" (uvalpha1) | |
1332 : "%"REG_a | |
1333 ); | |
1334 break; | |
1335 #endif | |
1336 case IMGFMT_RGB32: | |
1337 #ifndef HAVE_MMX | |
1338 case IMGFMT_BGR32: | |
1339 #endif | |
1340 if(dstFormat==IMGFMT_BGR32) | |
1341 { | |
1342 int i; | |
1343 #ifdef WORDS_BIGENDIAN | |
1344 dest++; | |
1345 #endif | |
1346 for(i=0;i<dstW;i++){ | |
1347 // vertical linear interpolation && yuv2rgb in a single step: | |
1348 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1349 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1350 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1351 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; | |
1352 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1353 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
1354 dest+= 4; | |
1355 } | |
1356 } | |
1357 else if(dstFormat==IMGFMT_BGR24) | |
1358 { | |
1359 int i; | |
1360 for(i=0;i<dstW;i++){ | |
1361 // vertical linear interpolation && yuv2rgb in a single step: | |
1362 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1363 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1364 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1365 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; | |
1366 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1367 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
1368 dest+= 3; | |
1369 } | |
1370 } | |
1371 else if(dstFormat==IMGFMT_BGR16) | |
1372 { | |
1373 int i; | |
1374 for(i=0;i<dstW;i++){ | |
1375 // vertical linear interpolation && yuv2rgb in a single step: | |
1376 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1377 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1378 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1379 | |
1380 ((uint16_t*)dest)[i] = | |
1381 clip_table16b[(Y + yuvtab_40cf[U]) >>13] | | |
1382 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1383 clip_table16r[(Y + yuvtab_3343[V]) >>13]; | |
1384 } | |
1385 } | |
1386 else if(dstFormat==IMGFMT_BGR15) | |
1387 { | |
1388 int i; | |
1389 for(i=0;i<dstW;i++){ | |
1390 // vertical linear interpolation && yuv2rgb in a single step: | |
1391 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1392 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1393 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1394 | |
1395 ((uint16_t*)dest)[i] = | |
1396 clip_table15b[(Y + yuvtab_40cf[U]) >>13] | | |
1397 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1398 clip_table15r[(Y + yuvtab_3343[V]) >>13]; | |
1399 } | |
1400 } | |
1401 }//FULL_UV_IPOL | |
1402 else | |
1403 { | |
1404 #endif // if 0 | |
1405 #ifdef HAVE_MMX | |
1406 switch(c->dstFormat) | |
1407 { | |
1408 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( | |
1409 case IMGFMT_BGR32: | |
1410 asm volatile( | |
1411 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1412 "mov %4, %%"REG_b" \n\t" | |
1413 "push %%"REG_BP" \n\t" | |
1414 YSCALEYUV2RGB(%%REGBP, %5) | |
1415 WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1416 "pop %%"REG_BP" \n\t" | |
1417 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1418 | |
1419 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1420 "a" (&c->redDither) | |
1421 ); | |
1422 return; | |
1423 case IMGFMT_BGR24: | |
1424 asm volatile( | |
1425 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1426 "mov %4, %%"REG_b" \n\t" | |
1427 "push %%"REG_BP" \n\t" | |
1428 YSCALEYUV2RGB(%%REGBP, %5) | |
1429 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1430 "pop %%"REG_BP" \n\t" | |
1431 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1432 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1433 "a" (&c->redDither) | |
1434 ); | |
1435 return; | |
1436 case IMGFMT_BGR15: | |
1437 asm volatile( | |
1438 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1439 "mov %4, %%"REG_b" \n\t" | |
1440 "push %%"REG_BP" \n\t" | |
1441 YSCALEYUV2RGB(%%REGBP, %5) | |
1442 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1443 #ifdef DITHER1XBPP | |
1444 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1445 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1446 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1447 #endif | |
1448 | |
1449 WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | |
1450 "pop %%"REG_BP" \n\t" | |
1451 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1452 | |
1453 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1454 "a" (&c->redDither) | |
1455 ); | |
1456 return; | |
1457 case IMGFMT_BGR16: | |
1458 asm volatile( | |
1459 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1460 "mov %4, %%"REG_b" \n\t" | |
1461 "push %%"REG_BP" \n\t" | |
1462 YSCALEYUV2RGB(%%REGBP, %5) | |
1463 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1464 #ifdef DITHER1XBPP | |
1465 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1466 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1467 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1468 #endif | |
1469 | |
1470 WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | |
1471 "pop %%"REG_BP" \n\t" | |
1472 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1473 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1474 "a" (&c->redDither) | |
1475 ); | |
1476 return; | |
1477 case IMGFMT_YUY2: | |
1478 asm volatile( | |
1479 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1480 "mov %4, %%"REG_b" \n\t" | |
1481 "push %%"REG_BP" \n\t" | |
1482 YSCALEYUV2PACKED(%%REGBP, %5) | |
1483 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1484 "pop %%"REG_BP" \n\t" | |
1485 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1486 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1487 "a" (&c->redDither) | |
1488 ); | |
1489 return; | |
1490 default: break; | |
1491 } | |
1492 #endif //HAVE_MMX | |
1493 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C) | |
1494 } | |
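/* The vertical blend computed here as a scalar sketch: yalpha runs
   0..4095, and yalpha1 = yalpha^4095 is a cheap stand-in for 4096-yalpha
   (off by at most one), so per pixel

       Y = (buf0[i]*yalpha1 + buf1[i]*yalpha) >> 19;

   and likewise for U/V with uvalpha, before the YUV->RGB step. */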
1495 | |
1496 /** | |
1497 * YV12 to RGB without scaling or interpolating | |
1498 */ | |
1499 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, | |
1500 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y) | |
1501 { | |
1502 const int yalpha1=0; | |
1503 int i; | |
1504 | |
1505 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1 | |
1506 const int yalpha= 4096; //FIXME ... | |
1507 | |
1508 if(flags&SWS_FULL_CHR_H_INT) | |
1509 { | |
1510 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y); | |
1511 return; | |
1512 } | |
1513 | |
1514 #ifdef HAVE_MMX | |
1515 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster |
1516 { | |
1517 switch(dstFormat) | |
1518 { | |
1519 case IMGFMT_BGR32: | |
1520 asm volatile( | |
1521 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1522 "mov %4, %%"REG_b" \n\t" | |
1523 "push %%"REG_BP" \n\t" | |
1524 YSCALEYUV2RGB1(%%REGBP, %5) | |
1525 WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1526 "pop %%"REG_BP" \n\t" | |
1527 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1528 | |
1529 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1530 "a" (&c->redDither) | |
1531 ); | |
1532 return; | |
1533 case IMGFMT_BGR24: | |
1534 asm volatile( | |
1535 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1536 "mov %4, %%"REG_b" \n\t" | |
1537 "push %%"REG_BP" \n\t" | |
1538 YSCALEYUV2RGB1(%%REGBP, %5) | |
1539 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1540 "pop %%"REG_BP" \n\t" | |
1541 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1542 | |
1543 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1544 "a" (&c->redDither) | |
1545 ); | |
1546 return; | |
1547 case IMGFMT_BGR15: | |
1548 asm volatile( | |
1549 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1550 "mov %4, %%"REG_b" \n\t" | |
1551 "push %%"REG_BP" \n\t" | |
1552 YSCALEYUV2RGB1(%%REGBP, %5) | |
1553 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1554 #ifdef DITHER1XBPP | |
1555 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1556 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1557 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1558 #endif | |
1559 WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | |
1560 "pop %%"REG_BP" \n\t" | |
1561 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1562 | |
1563 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1564 "a" (&c->redDither) | |
1565 ); | |
1566 return; | |
1567 case IMGFMT_BGR16: | |
1568 asm volatile( | |
1569 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1570 "mov %4, %%"REG_b" \n\t" | |
1571 "push %%"REG_BP" \n\t" | |
1572 YSCALEYUV2RGB1(%%REGBP, %5) | |
1573 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1574 #ifdef DITHER1XBPP | |
1575 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1576 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1577 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1578 #endif | |
1579 | |
1580 WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | |
1581 "pop %%"REG_BP" \n\t" | |
1582 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1583 | |
1584 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1585 "a" (&c->redDither) | |
1586 ); | |
1587 return; | |
1588 case IMGFMT_YUY2: | |
1589 asm volatile( | |
1590 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1591 "mov %4, %%"REG_b" \n\t" | |
1592 "push %%"REG_BP" \n\t" | |
1593 YSCALEYUV2PACKED1(%%REGBP, %5) | |
1594 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1595 "pop %%"REG_BP" \n\t" | |
1596 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1597 | |
1598 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1599 "a" (&c->redDither) | |
1600 ); | |
1601 return; | |
1602 } | |
1603 } | |
1604 else | |
1605 { | |
1606 switch(dstFormat) | |
1607 { | |
1608 case IMGFMT_BGR32: | |
1609 asm volatile( | |
1610 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1611 "mov %4, %%"REG_b" \n\t" | |
1612 "push %%"REG_BP" \n\t" | |
1613 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1614 WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1615 "pop %%"REG_BP" \n\t" | |
1616 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1617 | |
1618 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1619 "a" (&c->redDither) | |
1620 ); | |
1621 return; | |
1622 case IMGFMT_BGR24: | |
1623 asm volatile( | |
1624 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1625 "mov %4, %%"REG_b" \n\t" | |
1626 "push %%"REG_BP" \n\t" | |
1627 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1628 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1629 "pop %%"REG_BP" \n\t" | |
1630 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1631 | |
1632 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1633 "a" (&c->redDither) | |
1634 ); | |
1635 return; | |
1636 case IMGFMT_BGR15: | |
1637 asm volatile( | |
1638 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1639 "mov %4, %%"REG_b" \n\t" | |
1640 "push %%"REG_BP" \n\t" | |
1641 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1642 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1643 #ifdef DITHER1XBPP | |
1644 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1645 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1646 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1647 #endif | |
1648 WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | |
1649 "pop %%"REG_BP" \n\t" | |
1650 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1651 | |
1652 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1653 "a" (&c->redDither) | |
1654 ); | |
1655 return; | |
1656 case IMGFMT_BGR16: | |
1657 asm volatile( | |
1658 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1659 "mov %4, %%"REG_b" \n\t" | |
1660 "push %%"REG_BP" \n\t" | |
1661 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1662 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1663 #ifdef DITHER1XBPP | |
1664 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1665 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1666 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1667 #endif | |
1668 | |
1669 WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | |
1670 "pop %%"REG_BP" \n\t" | |
1671 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1672 | |
1673 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1674 "a" (&c->redDither) | |
1675 ); | |
1676 return; | |
1677 case IMGFMT_YUY2: | |
1678 asm volatile( | |
1679 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1680 "mov %4, %%"REG_b" \n\t" | |
1681 "push %%"REG_BP" \n\t" | |
1682 YSCALEYUV2PACKED1b(%%REGBP, %5) | |
1683 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1684 "pop %%"REG_BP" \n\t" | |
1685 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1686 | |
1687 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1688 "a" (&c->redDither) | |
1689 ); | |
1690 return; | |
1691 } | |
1692 } | |
1693 #endif | |
1694 if( uvalpha < 2048 ) | |
1695 { | |
1696 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C) | |
1697 }else{ | |
1698 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C) | |
1699 } | |
1700 } | |
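/* The uvalpha < 2048 split: below the midpoint the nearest chroma source
   is uvbuf0 alone, so the *1 macro variants read a single buffer and
   accept the half-pixel chroma shift noted above; otherwise the *1b
   variants average uvbuf0 and uvbuf1. */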
1701 | |
1702 //FIXME yuy2* can read up to 7 samples too many |
1703 | |
1704 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width) | |
1705 { | |
1706 #ifdef HAVE_MMX | |
1707 asm volatile( | |
1708 "movq "MANGLE(bm01010101)", %%mm2\n\t" | |
1709 "mov %0, %%"REG_a" \n\t" | |
1710 "1: \n\t" | |
1711 "movq (%1, %%"REG_a",2), %%mm0 \n\t" | |
1712 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" | |
1713 "pand %%mm2, %%mm0 \n\t" | |
1714 "pand %%mm2, %%mm1 \n\t" | |
1715 "packuswb %%mm1, %%mm0 \n\t" | |
1716 "movq %%mm0, (%2, %%"REG_a") \n\t" | |
1717 "add $8, %%"REG_a" \n\t" | |
1718 " js 1b \n\t" | |
1719 : : "g" (-width), "r" (src+width*2), "r" (dst+width) | |
1720 : "%"REG_a | |
1721 ); | |
1722 #else | |
1723 int i; | |
1724 for(i=0; i<width; i++) | |
1725 dst[i]= src[2*i]; | |
1726 #endif | |
1727 } | |
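/* YUYV keeps luma at even byte offsets, so this is just dst[i] = src[2*i]
   (the C fallback). bm01010101 is a 0x00FF mask in each 16-bit lane;
   pand plus packuswb compact the 8 luma bytes out of 16 input bytes per
   iteration. */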
1728 | |
1729 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) | |
1730 { | |
1731 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1732 asm volatile( | |
1733 "movq "MANGLE(bm01010101)", %%mm4\n\t" | |
1734 "mov %0, %%"REG_a" \n\t" | |
1735 "1: \n\t" | |
1736 "movq (%1, %%"REG_a",4), %%mm0 \n\t" | |
1737 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" | |
1738 "movq (%2, %%"REG_a",4), %%mm2 \n\t" | |
1739 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t" | |
1740 PAVGB(%%mm2, %%mm0) | |
1741 PAVGB(%%mm3, %%mm1) | |
1742 "psrlw $8, %%mm0 \n\t" | |
1743 "psrlw $8, %%mm1 \n\t" | |
1744 "packuswb %%mm1, %%mm0 \n\t" | |
1745 "movq %%mm0, %%mm1 \n\t" | |
1746 "psrlw $8, %%mm0 \n\t" | |
1747 "pand %%mm4, %%mm1 \n\t" | |
1748 "packuswb %%mm0, %%mm0 \n\t" | |
1749 "packuswb %%mm1, %%mm1 \n\t" | |
1750 "movd %%mm0, (%4, %%"REG_a") \n\t" | |
1751 "movd %%mm1, (%3, %%"REG_a") \n\t" | |
1752 "add $4, %%"REG_a" \n\t" | |
1753 " js 1b \n\t" | |
1754 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width) | |
1755 : "%"REG_a | |
1756 ); | |
1757 #else | |
1758 int i; | |
1759 for(i=0; i<width; i++) | |
1760 { | |
1761 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1; | |
1762 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1; | |
1763 } | |
1764 #endif | |
1765 } | |
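/* Chroma path: PAVGB takes a rounded byte average of the two source
   lines, psrlw $8 isolates the interleaved U/V bytes, and the final
   pack/mask pair splits U from V. Note the rounding difference: PAVGB
   computes (a+b+1)>>1 while the C fallback uses (a+b)>>1. */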
1766 | |
1767 //this is almost identical to the previous, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses |
1768 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width) | |
1769 { | |
1770 #ifdef HAVE_MMX | |
1771 asm volatile( | |
1772 "mov %0, %%"REG_a" \n\t" | |
1773 "1: \n\t" | |
1774 "movq (%1, %%"REG_a",2), %%mm0 \n\t" | |
1775 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" | |
1776 "psrlw $8, %%mm0 \n\t" | |
1777 "psrlw $8, %%mm1 \n\t" | |
1778 "packuswb %%mm1, %%mm0 \n\t" | |
1779 "movq %%mm0, (%2, %%"REG_a") \n\t" | |
1780 "add $8, %%"REG_a" \n\t" | |
1781 " js 1b \n\t" | |
1782 : : "g" (-width), "r" (src+width*2), "r" (dst+width) | |
1783 : "%"REG_a | |
1784 ); | |
1785 #else | |
1786 int i; | |
1787 for(i=0; i<width; i++) | |
1788 dst[i]= src[2*i+1]; | |
1789 #endif | |
1790 } | |
1791 | |
1792 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) | |
1793 { | |
1794 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1795 asm volatile( | |
1796 "movq "MANGLE(bm01010101)", %%mm4\n\t" | |
1797 "mov %0, %%"REG_a" \n\t" | |
1798 "1: \n\t" | |
1799 "movq (%1, %%"REG_a",4), %%mm0 \n\t" | |
1800 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" | |
1801 "movq (%2, %%"REG_a",4), %%mm2 \n\t" | |
1802 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t" | |
1803 PAVGB(%%mm2, %%mm0) | |
1804 PAVGB(%%mm3, %%mm1) | |
1805 "pand %%mm4, %%mm0 \n\t" | |
1806 "pand %%mm4, %%mm1 \n\t" | |
1807 "packuswb %%mm1, %%mm0 \n\t" | |
1808 "movq %%mm0, %%mm1 \n\t" | |
1809 "psrlw $8, %%mm0 \n\t" | |
1810 "pand %%mm4, %%mm1 \n\t" | |
1811 "packuswb %%mm0, %%mm0 \n\t" | |
1812 "packuswb %%mm1, %%mm1 \n\t" | |
1813 "movd %%mm0, (%4, %%"REG_a") \n\t" | |
1814 "movd %%mm1, (%3, %%"REG_a") \n\t" | |
1815 "add $4, %%"REG_a" \n\t" | |
1816 " js 1b \n\t" | |
1817 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width) | |
1818 : "%"REG_a | |
1819 ); | |
1820 #else | |
1821 int i; | |
1822 for(i=0; i<width; i++) | |
1823 { | |
1824 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1; | |
1825 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1; | |
1826 } | |
1827 #endif | |
1828 } | |
1829 | |
1830 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width) | |
1831 { | |
1832 int i; | |
1833 for(i=0; i<width; i++) | |
1834 { | |
1835 int b= ((uint32_t*)src)[i]&0xFF; | |
1836 int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
1837 int r= (((uint32_t*)src)[i]>>16)&0xFF; | |
1838 | |
1839 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); | |
1840 } | |
1841 } | |
1842 | |
1843 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1844 { | |
1845 int i; | |
1846 for(i=0; i<width; i++) | |
1847 { | |
1848 const int a= ((uint32_t*)src1)[2*i+0]; | |
1849 const int e= ((uint32_t*)src1)[2*i+1]; | |
1850 const int c= ((uint32_t*)src2)[2*i+0]; | |
1851 const int d= ((uint32_t*)src2)[2*i+1]; | |
1852 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF); | |
1853 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00); | |
1854 const int b= l&0x3FF; | |
1855 const int g= h>>8; | |
1856 const int r= l>>16; | |
1857 | |
1858 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1859 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1860 } | |
1861 } | |
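/* SWAR trick above: masking with 0xFF00FF sums B and R of four pixels in
   one accumulator l (each channel sum is at most 4*255, fitting 10 bits,
   hence b = l&0x3FF and r = l>>16), while h sums G in the 0x00FF00 lane.
   This averages a 2x2 block for chroma subsampling; the +2 in the shift
   is the divide-by-4 folded into the fixed-point dot product. */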
1862 | |
1863 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width) | |
1864 { | |
1865 #ifdef HAVE_MMX | |
1866 asm volatile( | |
1867 "mov %2, %%"REG_a" \n\t" | |
1868 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" | |
1869 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
1870 "pxor %%mm7, %%mm7 \n\t" | |
19396 | 1871 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t" |
1872 ASMALIGN(4) |
18861 | 1873 "1: \n\t" |
19396 | 1874 PREFETCH" 64(%0, %%"REG_d") \n\t" |
1875 "movd (%0, %%"REG_d"), %%mm0 \n\t" | |
1876 "movd 3(%0, %%"REG_d"), %%mm1 \n\t" | |
18861 | 1877 "punpcklbw %%mm7, %%mm0 \n\t" |
1878 "punpcklbw %%mm7, %%mm1 \n\t" | |
19396 | 1879 "movd 6(%0, %%"REG_d"), %%mm2 \n\t" |
1880 "movd 9(%0, %%"REG_d"), %%mm3 \n\t" | |
18861 | 1881 "punpcklbw %%mm7, %%mm2 \n\t" |
1882 "punpcklbw %%mm7, %%mm3 \n\t" | |
1883 "pmaddwd %%mm6, %%mm0 \n\t" | |
1884 "pmaddwd %%mm6, %%mm1 \n\t" | |
1885 "pmaddwd %%mm6, %%mm2 \n\t" | |
1886 "pmaddwd %%mm6, %%mm3 \n\t" | |
1887 #ifndef FAST_BGR2YV12 | |
1888 "psrad $8, %%mm0 \n\t" | |
1889 "psrad $8, %%mm1 \n\t" | |
1890 "psrad $8, %%mm2 \n\t" | |
1891 "psrad $8, %%mm3 \n\t" | |
1892 #endif | |
1893 "packssdw %%mm1, %%mm0 \n\t" | |
1894 "packssdw %%mm3, %%mm2 \n\t" | |
1895 "pmaddwd %%mm5, %%mm0 \n\t" | |
1896 "pmaddwd %%mm5, %%mm2 \n\t" | |
1897 "packssdw %%mm2, %%mm0 \n\t" | |
1898 "psraw $7, %%mm0 \n\t" | |
1899 | |
19396 | 1900 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" |
1901 "movd 15(%0, %%"REG_d"), %%mm1 \n\t" | |
18861 | 1902 "punpcklbw %%mm7, %%mm4 \n\t" |
1903 "punpcklbw %%mm7, %%mm1 \n\t" | |
19396 | 1904 "movd 18(%0, %%"REG_d"), %%mm2 \n\t" |
1905 "movd 21(%0, %%"REG_d"), %%mm3 \n\t" | |
18861 | 1906 "punpcklbw %%mm7, %%mm2 \n\t" |
1907 "punpcklbw %%mm7, %%mm3 \n\t" | |
1908 "pmaddwd %%mm6, %%mm4 \n\t" | |
1909 "pmaddwd %%mm6, %%mm1 \n\t" | |
1910 "pmaddwd %%mm6, %%mm2 \n\t" | |
1911 "pmaddwd %%mm6, %%mm3 \n\t" | |
1912 #ifndef FAST_BGR2YV12 | |
1913 "psrad $8, %%mm4 \n\t" | |
1914 "psrad $8, %%mm1 \n\t" | |
1915 "psrad $8, %%mm2 \n\t" | |
1916 "psrad $8, %%mm3 \n\t" | |
1917 #endif | |
1918 "packssdw %%mm1, %%mm4 \n\t" | |
1919 "packssdw %%mm3, %%mm2 \n\t" | |
1920 "pmaddwd %%mm5, %%mm4 \n\t" | |
1921 "pmaddwd %%mm5, %%mm2 \n\t" | |
19396 | 1922 "add $24, %%"REG_d" \n\t" |
18861 | 1923 "packssdw %%mm2, %%mm4 \n\t" |
1924 "psraw $7, %%mm4 \n\t" | |
1925 | |
1926 "packuswb %%mm4, %%mm0 \n\t" | |
1927 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" | |
1928 | |
1929 "movq %%mm0, (%1, %%"REG_a") \n\t" | |
1930 "add $8, %%"REG_a" \n\t" | |
1931 " js 1b \n\t" | |
1932 : : "r" (src+width*3), "r" (dst+width), "g" (-width) | |
19396 | 1933 : "%"REG_a, "%"REG_d |
18861 | 1934 ); |
1935 #else | |
1936 int i; | |
1937 for(i=0; i<width; i++) | |
1938 { | |
1939 int b= src[i*3+0]; | |
1940 int g= src[i*3+1]; | |
1941 int r= src[i*3+2]; | |
1942 | |
1943 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); | |
1944 } | |
1945 #endif | |
1946 } | |
1947 | |
1948 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) | |
1949 { | |
1950 #ifdef HAVE_MMX | |
1951 asm volatile( | |
1952 "mov %4, %%"REG_a" \n\t" | |
1953 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
1954 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | |
1955 "pxor %%mm7, %%mm7 \n\t" | |
19396 | 1956 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" |
1957 "add %%"REG_d", %%"REG_d" \n\t" | |
1958 ASMALIGN(4) |
18861 | 1959 "1: \n\t" |
19396 | 1960 PREFETCH" 64(%0, %%"REG_d") \n\t" |
1961 PREFETCH" 64(%1, %%"REG_d") \n\t" | |
18861 | 1962 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
19396 | 1963 "movq (%0, %%"REG_d"), %%mm0 \n\t" |
1964 "movq (%1, %%"REG_d"), %%mm1 \n\t" | |
1965 "movq 6(%0, %%"REG_d"), %%mm2 \n\t" | |
1966 "movq 6(%1, %%"REG_d"), %%mm3 \n\t" | |
18861 | 1967 PAVGB(%%mm1, %%mm0) |
1968 PAVGB(%%mm3, %%mm2) | |
1969 "movq %%mm0, %%mm1 \n\t" | |
1970 "movq %%mm2, %%mm3 \n\t" | |
1971 "psrlq $24, %%mm0 \n\t" | |
1972 "psrlq $24, %%mm2 \n\t" | |
1973 PAVGB(%%mm1, %%mm0) | |
1974 PAVGB(%%mm3, %%mm2) | |
1975 "punpcklbw %%mm7, %%mm0 \n\t" | |
1976 "punpcklbw %%mm7, %%mm2 \n\t" | |
1977 #else | |
19396 | 1978 "movd (%0, %%"REG_d"), %%mm0 \n\t" |
1979 "movd (%1, %%"REG_d"), %%mm1 \n\t" | |
1980 "movd 3(%0, %%"REG_d"), %%mm2 \n\t" | |
1981 "movd 3(%1, %%"REG_d"), %%mm3 \n\t" | |
18861 | 1982 "punpcklbw %%mm7, %%mm0 \n\t" |
1983 "punpcklbw %%mm7, %%mm1 \n\t" | |
1984 "punpcklbw %%mm7, %%mm2 \n\t" | |
1985 "punpcklbw %%mm7, %%mm3 \n\t" | |
1986 "paddw %%mm1, %%mm0 \n\t" | |
1987 "paddw %%mm3, %%mm2 \n\t" | |
1988 "paddw %%mm2, %%mm0 \n\t" | |
19396 | 1989 "movd 6(%0, %%"REG_d"), %%mm4 \n\t" |
1990 "movd 6(%1, %%"REG_d"), %%mm1 \n\t" | |
1991 "movd 9(%0, %%"REG_d"), %%mm2 \n\t" | |
1992 "movd 9(%1, %%"REG_d"), %%mm3 \n\t" | |
18861 | 1993 "punpcklbw %%mm7, %%mm4 \n\t" |
1994 "punpcklbw %%mm7, %%mm1 \n\t" | |
1995 "punpcklbw %%mm7, %%mm2 \n\t" | |
1996 "punpcklbw %%mm7, %%mm3 \n\t" | |
1997 "paddw %%mm1, %%mm4 \n\t" | |
1998 "paddw %%mm3, %%mm2 \n\t" | |
1999 "paddw %%mm4, %%mm2 \n\t" | |
2000 "psrlw $2, %%mm0 \n\t" | |
2001 "psrlw $2, %%mm2 \n\t" | |
2002 #endif | |
2003 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" | |
2004 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
2005 | |
2006 "pmaddwd %%mm0, %%mm1 \n\t" | |
2007 "pmaddwd %%mm2, %%mm3 \n\t" | |
2008 "pmaddwd %%mm6, %%mm0 \n\t" | |
2009 "pmaddwd %%mm6, %%mm2 \n\t" | |
2010 #ifndef FAST_BGR2YV12 | |
2011 "psrad $8, %%mm0 \n\t" | |
2012 "psrad $8, %%mm1 \n\t" | |
2013 "psrad $8, %%mm2 \n\t" | |
2014 "psrad $8, %%mm3 \n\t" | |
2015 #endif | |
2016 "packssdw %%mm2, %%mm0 \n\t" | |
2017 "packssdw %%mm3, %%mm1 \n\t" | |
2018 "pmaddwd %%mm5, %%mm0 \n\t" | |
2019 "pmaddwd %%mm5, %%mm1 \n\t" | |
2020 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
2021 "psraw $7, %%mm0 \n\t" | |
2022 | |
2023 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
19396 | 2024 "movq 12(%0, %%"REG_d"), %%mm4 \n\t" |
2025 "movq 12(%1, %%"REG_d"), %%mm1 \n\t" | |
2026 "movq 18(%0, %%"REG_d"), %%mm2 \n\t" | |
2027 "movq 18(%1, %%"REG_d"), %%mm3 \n\t" | |
18861 | 2028 PAVGB(%%mm1, %%mm4) |
2029 PAVGB(%%mm3, %%mm2) | |
2030 "movq %%mm4, %%mm1 \n\t" | |
2031 "movq %%mm2, %%mm3 \n\t" | |
2032 "psrlq $24, %%mm4 \n\t" | |
2033 "psrlq $24, %%mm2 \n\t" | |
2034 PAVGB(%%mm1, %%mm4) | |
2035 PAVGB(%%mm3, %%mm2) | |
2036 "punpcklbw %%mm7, %%mm4 \n\t" | |
2037 "punpcklbw %%mm7, %%mm2 \n\t" | |
2038 #else | |
19396 | 2039 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" |
2040 "movd 12(%1, %%"REG_d"), %%mm1 \n\t" | |
2041 "movd 15(%0, %%"REG_d"), %%mm2 \n\t" | |
2042 "movd 15(%1, %%"REG_d"), %%mm3 \n\t" | |
18861 | 2043 "punpcklbw %%mm7, %%mm4 \n\t" |
2044 "punpcklbw %%mm7, %%mm1 \n\t" | |
2045 "punpcklbw %%mm7, %%mm2 \n\t" | |
2046 "punpcklbw %%mm7, %%mm3 \n\t" | |
2047 "paddw %%mm1, %%mm4 \n\t" | |
2048 "paddw %%mm3, %%mm2 \n\t" | |
2049 "paddw %%mm2, %%mm4 \n\t" | |
19396 | 2050 "movd 18(%0, %%"REG_d"), %%mm5 \n\t" |
2051 "movd 18(%1, %%"REG_d"), %%mm1 \n\t" | |
2052 "movd 21(%0, %%"REG_d"), %%mm2 \n\t" | |
2053 "movd 21(%1, %%"REG_d"), %%mm3 \n\t" | |
18861 | 2054 "punpcklbw %%mm7, %%mm5 \n\t" |
2055 "punpcklbw %%mm7, %%mm1 \n\t" | |
2056 "punpcklbw %%mm7, %%mm2 \n\t" | |
2057 "punpcklbw %%mm7, %%mm3 \n\t" | |
2058 "paddw %%mm1, %%mm5 \n\t" | |
2059 "paddw %%mm3, %%mm2 \n\t" | |
2060 "paddw %%mm5, %%mm2 \n\t" | |
2061 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
2062 "psrlw $2, %%mm4 \n\t" | |
2063 "psrlw $2, %%mm2 \n\t" | |
2064 #endif | |
2065 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" | |
2066 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
2067 | |
2068 "pmaddwd %%mm4, %%mm1 \n\t" | |
2069 "pmaddwd %%mm2, %%mm3 \n\t" | |
2070 "pmaddwd %%mm6, %%mm4 \n\t" | |
2071 "pmaddwd %%mm6, %%mm2 \n\t" | |
2072 #ifndef FAST_BGR2YV12 | |
2073 "psrad $8, %%mm4 \n\t" | |
2074 "psrad $8, %%mm1 \n\t" | |
2075 "psrad $8, %%mm2 \n\t" | |
2076 "psrad $8, %%mm3 \n\t" | |
2077 #endif | |
2078 "packssdw %%mm2, %%mm4 \n\t" | |
2079 "packssdw %%mm3, %%mm1 \n\t" | |
2080 "pmaddwd %%mm5, %%mm4 \n\t" | |
2081 "pmaddwd %%mm5, %%mm1 \n\t" | |
19396 | 2082 "add $24, %%"REG_d" \n\t" |
18861 | 2083 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 |
2084 "psraw $7, %%mm4 \n\t" | |
2085 | |
2086 "movq %%mm0, %%mm1 \n\t" | |
2087 "punpckldq %%mm4, %%mm0 \n\t" | |
2088 "punpckhdq %%mm4, %%mm1 \n\t" | |
2089 "packsswb %%mm1, %%mm0 \n\t" | |
2090 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" | |
2091 | |
2092 "movd %%mm0, (%2, %%"REG_a") \n\t" | |
2093 "punpckhdq %%mm0, %%mm0 \n\t" | |
2094 "movd %%mm0, (%3, %%"REG_a") \n\t" | |
2095 "add $4, %%"REG_a" \n\t" | |
2096 " js 1b \n\t" | |
2097 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width) | |
19396 | 2098 : "%"REG_a, "%"REG_d |
18861 | 2099 ); |
2100 #else | |
2101 int i; | |
2102 for(i=0; i<width; i++) | |
2103 { | |
2104 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3]; | |
2105 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4]; | |
2106 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5]; | |
2107 | |
2108 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2109 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2110 } | |
2111 #endif | |
2112 } | |
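/* As in the other *ToUV routines, chroma comes from a 2x2 block: the C
   fallback sums two horizontal neighbours on each of two lines and folds
   the /4 into the +2 of the shift, while the MMX paths average up front
   (PAVGB twice with MMX2/3DNow, add plus psrlw $2 otherwise). */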
2113 | |
2114 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width) | |
2115 { | |
2116 int i; | |
2117 for(i=0; i<width; i++) | |
2118 { | |
2119 int d= ((uint16_t*)src)[i]; | |
2120 int b= d&0x1F; | |
2121 int g= (d>>5)&0x3F; | |
2122 int r= (d>>11)&0x1F; | |
2123 | |
2124 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16; | |
2125 } | |
2126 } | |
2127 | |
2128 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2129 { | |
2130 int i; | |
2131 for(i=0; i<width; i++) | |
2132 { | |
2133 int d0= ((uint32_t*)src1)[i]; | |
2134 int d1= ((uint32_t*)src2)[i]; | |
2135 | |
2136 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F); | |
2137 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F); | |
2138 | |
2139 int dh2= (dh>>11) + (dh<<21); | |
2140 int d= dh2 + dl; | |
2141 | |
2142 int b= d&0x7F; | |
2143 int r= (d>>11)&0x7F; | |
2144 int g= d>>21; | |
2145 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128; | |
2146 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128; | |
2147 } | |
2148 } | |
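/* 16bpp SWAR variant of the same 2x2 averaging: each uint32_t load holds
   two RGB565 pixels, and the 0x07E0F81F / 0x07C0F83F masks give every
   channel a spare carry bit so four pixels can be summed without
   cross-channel overflow. The doubled R/B coefficients and the -2 in the
   shift rescale the 5-bit (vs 6-bit G) channel sums back to 8-bit range. */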
2149 | |
2150 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width) | |
2151 { | |
2152 int i; | |
2153 for(i=0; i<width; i++) | |
2154 { | |
2155 int d= ((uint16_t*)src)[i]; | |
2156 int b= d&0x1F; | |
2157 int g= (d>>5)&0x1F; | |
2158 int r= (d>>10)&0x1F; | |
2159 | |
2160 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16; | |
2161 } | |
2162 } | |
2163 | |
2164 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2165 { | |
2166 int i; | |
2167 for(i=0; i<width; i++) | |
2168 { | |
2169 int d0= ((uint32_t*)src1)[i]; | |
2170 int d1= ((uint32_t*)src2)[i]; | |
2171 | |
2172 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F); | |
2173 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F); | |
2174 | |
2175 int dh2= (dh>>11) + (dh<<21); | |
2176 int d= dh2 + dl; | |
2177 | |
2178 int b= d&0x7F; | |
2179 int r= (d>>10)&0x7F; | |
2180 int g= d>>21; | |
2181 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128; | |
2182 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128; | |
2183 } | |
2184 } | |
2185 | |
2186 | |
2187 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width) | |
2188 { | |
2189 int i; | |
2190 for(i=0; i<width; i++) | |
2191 { | |
2192 int r= ((uint32_t*)src)[i]&0xFF; | |
2193 int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
2194 int b= (((uint32_t*)src)[i]>>16)&0xFF; | |
2195 | |
2196 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); | |
2197 } | |
2198 } | |
2199 | |
2200 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2201 { | |
2202 int i; | |
2203 for(i=0; i<width; i++) | |
2204 { | |
2205 const int a= ((uint32_t*)src1)[2*i+0]; | |
2206 const int e= ((uint32_t*)src1)[2*i+1]; | |
2207 const int c= ((uint32_t*)src2)[2*i+0]; | |
2208 const int d= ((uint32_t*)src2)[2*i+1]; | |
2209 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF); | |
2210 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00); | |
2211 const int r= l&0x3FF; | |
2212 const int g= h>>8; | |
2213 const int b= l>>16; | |
2214 | |
2215 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2216 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2217 } | |
2218 } | |
2219 | |
2220 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width) | |
2221 { | |
2222 int i; | |
2223 for(i=0; i<width; i++) | |
2224 { | |
2225 int r= src[i*3+0]; | |
2226 int g= src[i*3+1]; | |
2227 int b= src[i*3+2]; | |
2228 | |
2229 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); | |
2230 } | |
2231 } | |
2232 | |
2233 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2234 { | |
2235 int i; | |
2236 for(i=0; i<width; i++) | |
2237 { | |
2238 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3]; | |
2239 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4]; | |
2240 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5]; | |
2241 | |
2242 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2243 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2244 } | |
2245 } | |
2246 | |
2247 | |
2248 // Bilinear / Bicubic scaling | |
2249 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, | |
2250 int16_t *filter, int16_t *filterPos, long filterSize) | |
2251 { | |
2252 #ifdef HAVE_MMX | |
2253 assert(filterSize % 4 == 0 && filterSize>0); | |
2254 if(filterSize==4) // always true for upscaling, sometimes for downscaling too | |
2255 { | |
2256 long counter= -2*dstW; | |
2257 filter-= counter*2; | |
2258 filterPos-= counter/2; | |
2259 dst-= counter/2; | |
2260 asm volatile( | |
19396 | 2261 #if defined(PIC) |
2262 "push %%"REG_b" \n\t" | |
2263 #endif | |
18861 | 2264 "pxor %%mm7, %%mm7 \n\t" |
2265 "movq "MANGLE(w02)", %%mm6 \n\t" | |
2266 "push %%"REG_BP" \n\t" // we use 7 regs here ... | |
2267 "mov %%"REG_a", %%"REG_BP" \n\t" | |
19372 | 2268 ASMALIGN(4) |
18861 | 2269 "1: \n\t" |
2270 "movzwl (%2, %%"REG_BP"), %%eax \n\t" | |
2271 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t" | |
2272 "movq (%1, %%"REG_BP", 4), %%mm1\n\t" | |
2273 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t" | |
2274 "movd (%3, %%"REG_a"), %%mm0 \n\t" | |
2275 "movd (%3, %%"REG_b"), %%mm2 \n\t" | |
2276 "punpcklbw %%mm7, %%mm0 \n\t" | |
2277 "punpcklbw %%mm7, %%mm2 \n\t" | |
2278 "pmaddwd %%mm1, %%mm0 \n\t" | |
2279 "pmaddwd %%mm2, %%mm3 \n\t" | |
2280 "psrad $8, %%mm0 \n\t" | |
2281 "psrad $8, %%mm3 \n\t" | |
2282 "packssdw %%mm3, %%mm0 \n\t" | |
2283 "pmaddwd %%mm6, %%mm0 \n\t" | |
2284 "packssdw %%mm0, %%mm0 \n\t" | |
2285 "movd %%mm0, (%4, %%"REG_BP") \n\t" | |
2286 "add $4, %%"REG_BP" \n\t" | |
2287 " jnc 1b \n\t" | |
2288 | |
2289 "pop %%"REG_BP" \n\t" | |
19396 | 2290 #if defined(PIC) |
2291 "pop %%"REG_b" \n\t" | |
2292 #endif | |
18861 | 2293 : "+a" (counter) |
2294 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
19396 | 2295 #if !defined(PIC) |
18861 | 2296 : "%"REG_b |
19396 | 2297 #endif |
18861 | 2298 ); |
2299 } | |
2300 else if(filterSize==8) | |
2301 { | |
2302 long counter= -2*dstW; | |
2303 filter-= counter*4; | |
2304 filterPos-= counter/2; | |
2305 dst-= counter/2; | |
2306 asm volatile( | |
19396 | 2307 #if defined(PIC) |
2308 "push %%"REG_b" \n\t" | |
2309 #endif | |
18861 | 2310 "pxor %%mm7, %%mm7 \n\t" |
2311 "movq "MANGLE(w02)", %%mm6 \n\t" | |
2312 "push %%"REG_BP" \n\t" // we use 7 regs here ... | |
2313 "mov %%"REG_a", %%"REG_BP" \n\t" | |
19372 | 2314 ASMALIGN(4) |
18861 | 2315 "1: \n\t" |
2316 "movzwl (%2, %%"REG_BP"), %%eax \n\t" | |
2317 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t" | |
2318 "movq (%1, %%"REG_BP", 8), %%mm1\n\t" | |
2319 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t" | |
2320 "movd (%3, %%"REG_a"), %%mm0 \n\t" | |
2321 "movd (%3, %%"REG_b"), %%mm2 \n\t" | |
2322 "punpcklbw %%mm7, %%mm0 \n\t" | |
2323 "punpcklbw %%mm7, %%mm2 \n\t" | |
2324 "pmaddwd %%mm1, %%mm0 \n\t" | |
2325 "pmaddwd %%mm2, %%mm3 \n\t" | |
2326 | |
2327 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t" | |
2328 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t" | |
2329 "movd 4(%3, %%"REG_a"), %%mm4 \n\t" | |
2330 "movd 4(%3, %%"REG_b"), %%mm2 \n\t" | |
2331 "punpcklbw %%mm7, %%mm4 \n\t" | |
2332 "punpcklbw %%mm7, %%mm2 \n\t" | |
2333 "pmaddwd %%mm1, %%mm4 \n\t" | |
2334 "pmaddwd %%mm2, %%mm5 \n\t" | |
2335 "paddd %%mm4, %%mm0 \n\t" | |
2336 "paddd %%mm5, %%mm3 \n\t" | |
2337 | |
2338 "psrad $8, %%mm0 \n\t" | |
2339 "psrad $8, %%mm3 \n\t" | |
2340 "packssdw %%mm3, %%mm0 \n\t" | |
2341 "pmaddwd %%mm6, %%mm0 \n\t" | |
2342 "packssdw %%mm0, %%mm0 \n\t" | |
2343 "movd %%mm0, (%4, %%"REG_BP") \n\t" | |
2344 "add $4, %%"REG_BP" \n\t" | |
2345 " jnc 1b \n\t" | |
2346 | |
2347 "pop %%"REG_BP" \n\t" | |
19396 | 2348 #if defined(PIC) |
2349 "pop %%"REG_b" \n\t" | |
2350 #endif | |
18861 | 2351 : "+a" (counter) |
2352 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
19396 | 2353 #if !defined(PIC) |
18861 | 2354 : "%"REG_b |
19396 | 2355 #endif |
18861 | 2356 ); |
2357 } | |
2358 else | |
2359 { | |
2360 uint8_t *offset = src+filterSize; | |
2361 long counter= -2*dstW; | |
2362 // filter-= counter*filterSize/2; | |
2363 filterPos-= counter/2; | |
2364 dst-= counter/2; | |
2365 asm volatile( | |
2366 "pxor %%mm7, %%mm7 \n\t" | |
2367 "movq "MANGLE(w02)", %%mm6 \n\t" | |
19372 | 2368 ASMALIGN(4) |
18861 | 2369 "1: \n\t" |
2370 "mov %2, %%"REG_c" \n\t" | |
2371 "movzwl (%%"REG_c", %0), %%eax \n\t" | |
19396 | 2372 "movzwl 2(%%"REG_c", %0), %%edx \n\t" |
18861 | 2373 "mov %5, %%"REG_c" \n\t" |
2374 "pxor %%mm4, %%mm4 \n\t" | |
2375 "pxor %%mm5, %%mm5 \n\t" | |
2376 "2: \n\t" | |
2377 "movq (%1), %%mm1 \n\t" | |
2378 "movq (%1, %6), %%mm3 \n\t" | |
2379 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t" | |
19396 | 2380 "movd (%%"REG_c", %%"REG_d"), %%mm2\n\t" |
18861 | 2381 "punpcklbw %%mm7, %%mm0 \n\t" |
2382 "punpcklbw %%mm7, %%mm2 \n\t" | |
2383 "pmaddwd %%mm1, %%mm0 \n\t" | |
2384 "pmaddwd %%mm2, %%mm3 \n\t" | |
2385 "paddd %%mm3, %%mm5 \n\t" | |
2386 "paddd %%mm0, %%mm4 \n\t" | |
2387 "add $8, %1 \n\t" | |
2388 "add $4, %%"REG_c" \n\t" | |
2389 "cmp %4, %%"REG_c" \n\t" | |
2390 " jb 2b \n\t" | |
2391 "add %6, %1 \n\t" | |
2392 "psrad $8, %%mm4 \n\t" | |
2393 "psrad $8, %%mm5 \n\t" | |
2394 "packssdw %%mm5, %%mm4 \n\t" | |
2395 "pmaddwd %%mm6, %%mm4 \n\t" | |
2396 "packssdw %%mm4, %%mm4 \n\t" | |
2397 "mov %3, %%"REG_a" \n\t" | |
2398 "movd %%mm4, (%%"REG_a", %0) \n\t" | |
2399 "add $4, %0 \n\t" | |
2400 " jnc 1b \n\t" | |
2401 | |
2402 : "+r" (counter), "+r" (filter) | |
2403 : "m" (filterPos), "m" (dst), "m"(offset), | |
2404 "m" (src), "r" (filterSize*2) | |
19396 | 2405 : "%"REG_a, "%"REG_c, "%"REG_d |
18861 | 2406 ); |
2407 } | |
2408 #else | |
2409 #ifdef HAVE_ALTIVEC | |
2410 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize); | |
2411 #else | |
2412 int i; | |
2413 for(i=0; i<dstW; i++) | |
2414 { | |
2415 int j; | |
2416 int srcPos= filterPos[i]; | |
2417 int val=0; | |
2418 // printf("filterPos: %d\n", filterPos[i]); | |
2419 for(j=0; j<filterSize; j++) | |
2420 { | |
2421 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); | |
2422 val += ((int)src[srcPos + j])*filter[filterSize*i + j]; | |
2423 } | |
2424 // filter += hFilterSize; | |
19181 | 2425 dst[i] = FFMIN(FFMAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ... |
18861 | 2426 // dst[i] = val>>7; |
2427 } | |
2428 #endif | |
2429 #endif | |
2430 } | |
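/* Illustrative usage sketch, not part of the original file: the filter bank
   holds filterSize coefficients per output sample and filterPos[i] names the
   leftmost source pixel of each window; coefficients sum to 1<<14 so each
   output lands at src<<7, the scale the vertical scaler expects. A
   hypothetical 4-tap box filter (filterSize must be a multiple of 4 for the
   MMX path) halving an 8-pixel line: */
#if 0
{
	uint8_t  src[8]      = {0, 16, 32, 48, 64, 80, 96, 112};
	int16_t  dst[3];
	int16_t  filter[12]  = {8192,8192,0,0, 8192,8192,0,0, 8192,8192,0,0};
	int16_t  filterPos[3]= {0, 2, 4};
	RENAME(hScale)(dst, 3, src, 8, 0 /* xInc is unused by the filter paths */,
	               filter, filterPos, 4);
	// dst[] == {8*128, 40*128, 72*128}, i.e. the pair averages scaled by 128
}
#endif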
2431 // *** horizontal scale Y line to temp buffer | |
2432 static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc, | |
2433 int flags, int canMMX2BeUsed, int16_t *hLumFilter, | |
2434 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, | |
2435 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, | |
2436 int32_t *mmx2FilterPos) | |
2437 { | |
2438 if(srcFormat==IMGFMT_YUY2) | |
2439 { | |
2440 RENAME(yuy2ToY)(formatConvBuffer, src, srcW); | |
2441 src= formatConvBuffer; | |
2442 } | |
2443 else if(srcFormat==IMGFMT_UYVY) | |
2444 { | |
2445 RENAME(uyvyToY)(formatConvBuffer, src, srcW); | |
2446 src= formatConvBuffer; | |
2447 } | |
2448 else if(srcFormat==IMGFMT_BGR32) | |
2449 { | |
2450 RENAME(bgr32ToY)(formatConvBuffer, src, srcW); | |
2451 src= formatConvBuffer; | |
2452 } | |
2453 else if(srcFormat==IMGFMT_BGR24) | |
2454 { | |
2455 RENAME(bgr24ToY)(formatConvBuffer, src, srcW); | |
2456 src= formatConvBuffer; | |
2457 } | |
2458 else if(srcFormat==IMGFMT_BGR16) | |
2459 { | |
2460 RENAME(bgr16ToY)(formatConvBuffer, src, srcW); | |
2461 src= formatConvBuffer; | |
2462 } | |
2463 else if(srcFormat==IMGFMT_BGR15) | |
2464 { | |
2465 RENAME(bgr15ToY)(formatConvBuffer, src, srcW); | |
2466 src= formatConvBuffer; | |
2467 } | |
2468 else if(srcFormat==IMGFMT_RGB32) | |
2469 { | |
2470 RENAME(rgb32ToY)(formatConvBuffer, src, srcW); | |
2471 src= formatConvBuffer; | |
2472 } | |
2473 else if(srcFormat==IMGFMT_RGB24) | |
2474 { | |
2475 RENAME(rgb24ToY)(formatConvBuffer, src, srcW); | |
2476 src= formatConvBuffer; | |
2477 } | |
2478 | |
2479 #ifdef HAVE_MMX | |
2480 // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one) | |
2481 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) | |
2482 #else | |
2483 if(!(flags&SWS_FAST_BILINEAR)) | |
2484 #endif | |
2485 { | |
2486 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); | |
2487 } | |
2488 else // Fast Bilinear upscale / crap downscale | |
2489 { | |
2490 #if defined(ARCH_X86) || defined(ARCH_X86_64) | |
2491 #ifdef HAVE_MMX2 | |
2492 int i; | |
19396 | 2493 #if defined(PIC) |
2494 uint64_t ebxsave __attribute__((aligned(8))); | |
2495 #endif | |
18861 | 2496 if(canMMX2BeUsed) |
2497 { | |
2498 asm volatile( | |
19396 | 2499 #if defined(PIC) |
2500 "mov %%"REG_b", %5 \n\t" | |
2501 #endif | |
18861 | 2502 "pxor %%mm7, %%mm7 \n\t" |
2503 "mov %0, %%"REG_c" \n\t" | |
2504 "mov %1, %%"REG_D" \n\t" | |
2505 "mov %2, %%"REG_d" \n\t" | |
2506 "mov %3, %%"REG_b" \n\t" | |
2507 "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2508 PREFETCH" (%%"REG_c") \n\t" | |
2509 PREFETCH" 32(%%"REG_c") \n\t" | |
2510 PREFETCH" 64(%%"REG_c") \n\t" | |
2511 | |
2512 #ifdef ARCH_X86_64 | |
2513 | |
2514 #define FUNNY_Y_CODE \ | |
2515 "movl (%%"REG_b"), %%esi \n\t"\ | |
2516 "call *%4 \n\t"\ | |
2517 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\ | |
2518 "add %%"REG_S", %%"REG_c" \n\t"\ | |
2519 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2520 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2521 | |
2522 #else | |
2523 | |
2524 #define FUNNY_Y_CODE \ | |
2525 "movl (%%"REG_b"), %%esi \n\t"\ | |
2526 "call *%4 \n\t"\ | |
2527 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\ | |
2528 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2529 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2530 | |
2531 #endif | |
2532 | |
2533 FUNNY_Y_CODE | |
2534 FUNNY_Y_CODE | |
2535 FUNNY_Y_CODE | |
2536 FUNNY_Y_CODE | |
2537 FUNNY_Y_CODE | |
2538 FUNNY_Y_CODE | |
2539 FUNNY_Y_CODE | |
2540 FUNNY_Y_CODE | |
2541 | |
19396 | 2542 #if defined(PIC) |
2543 "mov %5, %%"REG_b" \n\t" | |
2544 #endif | |
18861 | 2545 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2546 "m" (funnyYCode) | |
19396 | 2547 #if defined(PIC) |
2548 ,"m" (ebxsave) | |
2549 #endif | |
2550 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D | |
2551 #if !defined(PIC) | |
2552 ,"%"REG_b | |
2553 #endif | |
18861 | 2554 ); |
2555 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; | |
2556 } | |
2557 else | |
2558 { | |
2559 #endif | |
2560 long xInc_shr16 = xInc >> 16; | |
2561 uint16_t xInc_mask = xInc & 0xffff; | |
2562 //NO MMX just normal asm ... | |
2563 asm volatile( | |
2564 "xor %%"REG_a", %%"REG_a" \n\t" // i | |
19396 | 2565 "xor %%"REG_d", %%"REG_d" \n\t" // xx |
18861 | 2566 "xorl %%ecx, %%ecx \n\t" // 2*xalpha |
19372 | 2567 ASMALIGN(4) |
18861 | 2568 "1: \n\t" |
19396 | 2569 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] |
2570 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
18861 | 2571 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2572 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2573 "shll $16, %%edi \n\t" | |
2574 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2575 "mov %1, %%"REG_D" \n\t" | |
2576 "shrl $9, %%esi \n\t" | |
2577 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t" | |
2578 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
19396 | 2579 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry |
18861 | 2580 |
19396 | 2581 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] |
2582 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
18861 | 2583 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2584 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2585 "shll $16, %%edi \n\t" | |
2586 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2587 "mov %1, %%"REG_D" \n\t" | |
2588 "shrl $9, %%esi \n\t" | |
2589 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t" | |
2590 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
19396 | 2591 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry |
18861 | 2592 |
2593 | |
2594 "add $2, %%"REG_a" \n\t" | |
2595 "cmp %2, %%"REG_a" \n\t" | |
2596 " jb 1b \n\t" | |
2597 | |
2598 | |
2599 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask) | |
19396 | 2600 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" |
18861 | 2601 ); |
2602 #ifdef HAVE_MMX2 | |
2603 } //if MMX2 can't be used | |
2604 #endif | |
2605 #else | |
2606 int i; | |
2607 unsigned int xpos=0; | |
2608 for(i=0;i<dstWidth;i++) | |
2609 { | |
2610 register unsigned int xx=xpos>>16; | |
2611 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2612 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; | |
2613 xpos+=xInc; | |
2614 } | |
2615 #endif | |
2616 } | |
2617 } | |
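/* Illustrative note, not part of the original file: the fast bilinear path
   steps through the source in 16.16 fixed point with xInc ~ (srcW<<16)/dstW;
   the asm keeps the integer part in a register advanced by the carry of a
   16-bit add ("adc"), which the C fallback spells as xpos>>16. A hypothetical
   worked step for a 2x upscale: */
#if 0
{
	unsigned xInc  = (100<<16)/200;     // 0x8000: half a source pixel per output pixel
	unsigned xpos  = 3*xInc;            // position of output pixel 3
	unsigned xx    = xpos>>16;          // source index: 1
	unsigned xalpha= (xpos&0xFFFF)>>9;  // 7 bit blend weight: 64, i.e. 1/2
	// dst= (src[1]<<7) + (src[2]-src[1])*64: halfway between the two, scaled by 128
}
#endif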
2618 | |
2619 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2, | |
2620 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, | |
2621 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, | |
2622 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, | |
2623 int32_t *mmx2FilterPos) | |
2624 { | |
2625 if(srcFormat==IMGFMT_YUY2) | |
2626 { | |
2627 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2628 src1= formatConvBuffer; | |
2629 src2= formatConvBuffer+2048; | |
2630 } | |
2631 else if(srcFormat==IMGFMT_UYVY) | |
2632 { | |
2633 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2634 src1= formatConvBuffer; | |
2635 src2= formatConvBuffer+2048; | |
2636 } | |
2637 else if(srcFormat==IMGFMT_BGR32) | |
2638 { | |
2639 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2640 src1= formatConvBuffer; | |
2641 src2= formatConvBuffer+2048; | |
2642 } | |
2643 else if(srcFormat==IMGFMT_BGR24) | |
2644 { | |
2645 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2646 src1= formatConvBuffer; | |
2647 src2= formatConvBuffer+2048; | |
2648 } | |
2649 else if(srcFormat==IMGFMT_BGR16) | |
2650 { | |
2651 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2652 src1= formatConvBuffer; | |
2653 src2= formatConvBuffer+2048; | |
2654 } | |
2655 else if(srcFormat==IMGFMT_BGR15) | |
2656 { | |
2657 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2658 src1= formatConvBuffer; | |
2659 src2= formatConvBuffer+2048; | |
2660 } | |
2661 else if(srcFormat==IMGFMT_RGB32) | |
2662 { | |
2663 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2664 src1= formatConvBuffer; | |
2665 src2= formatConvBuffer+2048; | |
2666 } | |
2667 else if(srcFormat==IMGFMT_RGB24) | |
2668 { | |
2669 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2670 src1= formatConvBuffer; | |
2671 src2= formatConvBuffer+2048; | |
2672 } | |
2673 else if(isGray(srcFormat)) | |
2674 { | |
2675 return; | |
2676 } | |
2677 | |
2678 #ifdef HAVE_MMX | |
2679 // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one) | |
2680 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) | |
2681 #else | |
2682 if(!(flags&SWS_FAST_BILINEAR)) | |
2683 #endif | |
2684 { | |
2685 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2686 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2687 } | |
2688 else // Fast Bilinear upscale / crap downscale | |
2689 { | |
2690 #if defined(ARCH_X86) || defined(ARCH_X86_64) | |
2691 #ifdef HAVE_MMX2 | |
2692 int i; | |
19396 | 2693 #if defined(PIC) |
2694 uint64_t ebxsave __attribute__((aligned(8))); | |
2695 #endif | |
18861 | 2696 if(canMMX2BeUsed) |
2697 { | |
2698 asm volatile( | |
19396 | 2699 #if defined(PIC) |
2700 "mov %%"REG_b", %6 \n\t" | |
2701 #endif | |
18861 | 2702 "pxor %%mm7, %%mm7 \n\t" |
2703 "mov %0, %%"REG_c" \n\t" | |
2704 "mov %1, %%"REG_D" \n\t" | |
2705 "mov %2, %%"REG_d" \n\t" | |
2706 "mov %3, %%"REG_b" \n\t" | |
2707 "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2708 PREFETCH" (%%"REG_c") \n\t" | |
2709 PREFETCH" 32(%%"REG_c") \n\t" | |
2710 PREFETCH" 64(%%"REG_c") \n\t" | |
2711 | |
2712 #ifdef ARCH_X86_64 | |
2713 | |
2714 #define FUNNY_UV_CODE \ | |
2715 "movl (%%"REG_b"), %%esi \n\t"\ | |
2716 "call *%4 \n\t"\ | |
2717 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\ | |
2718 "add %%"REG_S", %%"REG_c" \n\t"\ | |
2719 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2720 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2721 | |
2722 #else | |
2723 | |
2724 #define FUNNY_UV_CODE \ | |
2725 "movl (%%"REG_b"), %%esi \n\t"\ | |
2726 "call *%4 \n\t"\ | |
2727 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\ | |
2728 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2729 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2730 | |
2731 #endif | |
2732 | |
2733 FUNNY_UV_CODE | |
2734 FUNNY_UV_CODE | |
2735 FUNNY_UV_CODE | |
2736 FUNNY_UV_CODE | |
2737 "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2738 "mov %5, %%"REG_c" \n\t" // src | |
2739 "mov %1, %%"REG_D" \n\t" // buf1 | |
2740 "add $4096, %%"REG_D" \n\t" | |
2741 PREFETCH" (%%"REG_c") \n\t" | |
2742 PREFETCH" 32(%%"REG_c") \n\t" | |
2743 PREFETCH" 64(%%"REG_c") \n\t" | |
2744 | |
2745 FUNNY_UV_CODE | |
2746 FUNNY_UV_CODE | |
2747 FUNNY_UV_CODE | |
2748 FUNNY_UV_CODE | |
2749 | |
19396 | 2750 #if defined(PIC) |
2751 "mov %6, %%"REG_b" \n\t" | |
2752 #endif | |
18861 | 2753 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2754 "m" (funnyUVCode), "m" (src2) | |
19396 | 2755 #if defined(PIC) |
2756 ,"m" (ebxsave) | |
2757 #endif | |
19400 | 2758 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D |
19396 | 2759 #if !defined(PIC) |
2760 ,"%"REG_b | |
2761 #endif | |
18861 | 2762 ); |
2763 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) | |
2764 { | |
2765 // printf("%d %d %d\n", dstWidth, i, srcW); | |
2766 dst[i] = src1[srcW-1]*128; | |
2767 dst[i+2048] = src2[srcW-1]*128; | |
2768 } | |
2769 } | |
2770 else | |
2771 { | |
2772 #endif | |
2773 long xInc_shr16 = (long) (xInc >> 16); | |
2774 uint16_t xInc_mask = xInc & 0xffff; | |
2775 asm volatile( | |
2776 "xor %%"REG_a", %%"REG_a" \n\t" // i | |
19396 | 2777 "xor %%"REG_d", %%"REG_d" \n\t" // xx |
18861 | 2778 "xorl %%ecx, %%ecx \n\t" // 2*xalpha |
19372 | 2779 ASMALIGN(4) |
18861 | 2780 "1: \n\t" |
2781 "mov %0, %%"REG_S" \n\t" | |
19396 | 2782 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx] |
2783 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1] | |
18861 | 2784 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2785 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2786 "shll $16, %%edi \n\t" | |
2787 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2788 "mov %1, %%"REG_D" \n\t" | |
2789 "shrl $9, %%esi \n\t" | |
2790 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t" | |
2791 | |
19396 | 2792 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx] |
2793 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
18861 | 2794 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2795 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2796 "shll $16, %%edi \n\t" | |
2797 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2798 "mov %1, %%"REG_D" \n\t" | |
2799 "shrl $9, %%esi \n\t" | |
2800 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t" | |
2801 | |
2802 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
19396 | 2803 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry |
18861 | 2804 "add $1, %%"REG_a" \n\t" |
2805 "cmp %2, %%"REG_a" \n\t" | |
2806 " jb 1b \n\t" | |
2807 | |
2808 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here, | |
2809 which is needed to support GCC-4.0 */ | |
2810 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4)) | |
2811 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), | |
2812 #else | |
2813 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), | |
2814 #endif | |
2815 "r" (src2) | |
19396 | 2816 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" |
18861 | 2817 ); |
2818 #ifdef HAVE_MMX2 | |
2819 } //if MMX2 can't be used | |
2820 #endif | |
2821 #else | |
2822 int i; | |
2823 unsigned int xpos=0; | |
2824 for(i=0;i<dstWidth;i++) | |
2825 { | |
2826 register unsigned int xx=xpos>>16; | |
2827 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2828 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | |
2829 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | |
2830 /* slower | |
2831 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; | |
2832 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; | |
2833 */ | |
2834 xpos+=xInc; | |
2835 } | |
2836 #endif | |
2837 } | |
2838 } | |
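/* Illustrative note, not part of the original file: hcscale scales U into
   dst[0..dstWidth-1] and V into dst[2048..], i.e. both chroma planes share one
   temp buffer 2048 int16_t entries apart -- which is why the asm above uses a
   4096 byte offset while the C fallback indexes i+2048. */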
2839 | |
2840 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | |
2841 int srcSliceH, uint8_t* dst[], int dstStride[]){ | |
2842 | |
2843 /* load a few things into local vars to make the code more readable and faster */ | |
2844 const int srcW= c->srcW; | |
2845 const int dstW= c->dstW; | |
2846 const int dstH= c->dstH; | |
2847 const int chrDstW= c->chrDstW; | |
2848 const int chrSrcW= c->chrSrcW; | |
2849 const int lumXInc= c->lumXInc; | |
2850 const int chrXInc= c->chrXInc; | |
2851 const int dstFormat= c->dstFormat; | |
2852 const int srcFormat= c->srcFormat; | |
2853 const int flags= c->flags; | |
2854 const int canMMX2BeUsed= c->canMMX2BeUsed; | |
2855 int16_t *vLumFilterPos= c->vLumFilterPos; | |
2856 int16_t *vChrFilterPos= c->vChrFilterPos; | |
2857 int16_t *hLumFilterPos= c->hLumFilterPos; | |
2858 int16_t *hChrFilterPos= c->hChrFilterPos; | |
2859 int16_t *vLumFilter= c->vLumFilter; | |
2860 int16_t *vChrFilter= c->vChrFilter; | |
2861 int16_t *hLumFilter= c->hLumFilter; | |
2862 int16_t *hChrFilter= c->hChrFilter; | |
2863 int32_t *lumMmxFilter= c->lumMmxFilter; | |
2864 int32_t *chrMmxFilter= c->chrMmxFilter; | |
2865 const int vLumFilterSize= c->vLumFilterSize; | |
2866 const int vChrFilterSize= c->vChrFilterSize; | |
2867 const int hLumFilterSize= c->hLumFilterSize; | |
2868 const int hChrFilterSize= c->hChrFilterSize; | |
2869 int16_t **lumPixBuf= c->lumPixBuf; | |
2870 int16_t **chrPixBuf= c->chrPixBuf; | |
2871 const int vLumBufSize= c->vLumBufSize; | |
2872 const int vChrBufSize= c->vChrBufSize; | |
2873 uint8_t *funnyYCode= c->funnyYCode; | |
2874 uint8_t *funnyUVCode= c->funnyUVCode; | |
2875 uint8_t *formatConvBuffer= c->formatConvBuffer; | |
2876 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample; | |
2877 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); | |
2878 int lastDstY; | |
2879 | |
2880 /* vars which will change and which we need to store back in the context */ | |
2881 int dstY= c->dstY; | |
2882 int lumBufIndex= c->lumBufIndex; | |
2883 int chrBufIndex= c->chrBufIndex; | |
2884 int lastInLumBuf= c->lastInLumBuf; | |
2885 int lastInChrBuf= c->lastInChrBuf; | |
2886 | |
2887 if(isPacked(c->srcFormat)){ | |
2888 src[0]= | |
2889 src[1]= | |
2890 src[2]= src[0]; | |
2891 srcStride[0]= | |
2892 srcStride[1]= | |
2893 srcStride[2]= srcStride[0]; | |
2894 } | |
2895 srcStride[1]<<= c->vChrDrop; | |
2896 srcStride[2]<<= c->vChrDrop; | |
2897 | |
2898 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2], | |
2899 // (int)dst[0], (int)dst[1], (int)dst[2]); | |
2900 | |
2901 #if 0 //self test FIXME move to a vfilter or something | |
2902 { | |
2903 static volatile int i=0; | |
2904 i++; | |
2905 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH) | |
2906 selfTest(src, srcStride, c->srcW, c->srcH); | |
2907 i--; | |
2908 } | |
2909 #endif | |
2910 | |
2911 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2], | |
2912 //dstStride[0],dstStride[1],dstStride[2]); | |
2913 | |
2914 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0) | |
2915 { | |
2916 static int firstTime=1; //FIXME move this into the context perhaps | |
2917 if(flags & SWS_PRINT_INFO && firstTime) | |
2918 { | |
2919 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n" | |
2920 "SwScaler: ->cannot do aligned memory acesses anymore\n"); | |
2921 firstTime=0; | |
2922 } | |
2923 } | |
2924 | |
2925 /* Note: the user might start scaling in the middle of the picture, so this will not get executed. | |
2926 This is not really intended, but it currently works, so people might rely on it. */ | |
2927 if(srcSliceY ==0){ | |
2928 lumBufIndex=0; | |
2929 chrBufIndex=0; | |
2930 dstY=0; | |
2931 lastInLumBuf= -1; | |
2932 lastInChrBuf= -1; | |
2933 } | |
2934 | |
2935 lastDstY= dstY; | |
2936 | |
2937 for(;dstY < dstH; dstY++){ | |
2938 unsigned char *dest =dst[0]+dstStride[0]*dstY; | |
2939 const int chrDstY= dstY>>c->chrDstVSubSample; | |
2940 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY; | |
2941 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY; | |
2942 | |
2943 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input | |
2944 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input | |
2945 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input | |
2946 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input | |
2947 | |
2948 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n", | |
2949 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample); | |
2950 //handle holes (FAST_BILINEAR & weird filters) | |
2951 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; | |
2952 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; | |
2953 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize); | |
2954 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1) | |
2955 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1) | |
2956 | |
2957 // Do we have enough lines in this slice to output the dstY line? | |
2958 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample)) | |
2959 { | |
2960 //Do horizontal scaling | |
2961 while(lastInLumBuf < lastLumSrcY) | |
2962 { | |
2963 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; | |
2964 lumBufIndex++; | |
2965 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY); | |
2966 ASSERT(lumBufIndex < 2*vLumBufSize) | |
2967 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
2968 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
2969 // printf("%d %d\n", lumBufIndex, vLumBufSize); | |
2970 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, | |
2971 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, | |
2972 funnyYCode, c->srcFormat, formatConvBuffer, | |
2973 c->lumMmx2Filter, c->lumMmx2FilterPos); | |
2974 lastInLumBuf++; | |
2975 } | |
2976 while(lastInChrBuf < lastChrSrcY) | |
2977 { | |
2978 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; | |
2979 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; | |
2980 chrBufIndex++; | |
2981 ASSERT(chrBufIndex < 2*vChrBufSize) | |
2982 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)) | |
2983 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0) | |
2984 //FIXME replace parameters through context struct (some at least) | |
2985 | |
2986 if(!(isGray(srcFormat) || isGray(dstFormat))) | |
2987 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, | |
2988 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, | |
2989 funnyUVCode, c->srcFormat, formatConvBuffer, | |
2990 c->chrMmx2Filter, c->chrMmx2FilterPos); | |
2991 lastInChrBuf++; | |
2992 } | |
2993 //wrap buf index around to stay inside the ring buffer | |
2994 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
2995 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
2996 } | |
2997 else // not enough lines left in this slice -> load the rest in the buffer | |
2998 { | |
2999 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n", | |
3000 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY, | |
3001 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize, | |
3002 vChrBufSize, vLumBufSize);*/ | |
3003 | |
3004 //Do horizontal scaling | |
3005 while(lastInLumBuf+1 < srcSliceY + srcSliceH) | |
3006 { | |
3007 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; | |
3008 lumBufIndex++; | |
3009 ASSERT(lumBufIndex < 2*vLumBufSize) | |
3010 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
3011 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
3012 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, | |
3013 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, | |
3014 funnyYCode, c->srcFormat, formatConvBuffer, | |
3015 c->lumMmx2Filter, c->lumMmx2FilterPos); | |
3016 lastInLumBuf++; | |
3017 } | |
3018 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH)) | |
3019 { | |
3020 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; | |
3021 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; | |
3022 chrBufIndex++; | |
3023 ASSERT(chrBufIndex < 2*vChrBufSize) | |
3024 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH) | |
3025 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0) | |
3026 | |
3027 if(!(isGray(srcFormat) || isGray(dstFormat))) | |
3028 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, | |
3029 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, | |
3030 funnyUVCode, c->srcFormat, formatConvBuffer, | |
3031 c->chrMmx2Filter, c->chrMmx2FilterPos); | |
3032 lastInChrBuf++; | |
3033 } | |
3034 //wrap buf index around to stay inside the ring buffer | |
3035 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
3036 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
3037 break; //we can't output a dstY line so let's try with the next slice | |
3038 } | |
3039 | |
3040 #ifdef HAVE_MMX | |
3041 b5Dither= dither8[dstY&1]; | |
3042 g6Dither= dither4[dstY&1]; | |
3043 g5Dither= dither8[dstY&1]; | |
3044 r5Dither= dither8[(dstY+1)&1]; | |
3045 #endif | |
3046 if(dstY < dstH-2) | |
3047 { | |
3048 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
3049 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
3050 #ifdef HAVE_MMX | |
3051 int i; | |
19172 | 3052 if(flags & SWS_ACCURATE_RND){ |
3053 for(i=0; i<vLumFilterSize; i+=2){ | |
3054 lumMmxFilter[2*i+0]= lumSrcPtr[i ]; | |
3055 lumMmxFilter[2*i+1]= lumSrcPtr[i+(vLumFilterSize>1)]; | |
3056 lumMmxFilter[2*i+2]= | |
3057 lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ] | |
3058 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0); | |
3059 } | |
3060 for(i=0; i<vChrFilterSize; i+=2){ | |
3061 chrMmxFilter[2*i+0]= chrSrcPtr[i ]; | |
3062 chrMmxFilter[2*i+1]= chrSrcPtr[i+(vChrFilterSize>1)]; | |
3063 chrMmxFilter[2*i+2]= | |
3064 chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ] | |
3065 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0); | |
3066 } | |
3067 }else{ | |
18861 | 3068 for(i=0; i<vLumFilterSize; i++) |
3069 { | |
3070 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i]; | |
3071 lumMmxFilter[4*i+2]= | |
3072 lumMmxFilter[4*i+3]= | |
3073 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; | |
3074 } | |
3075 for(i=0; i<vChrFilterSize; i++) | |
3076 { | |
3077 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i]; | |
3078 chrMmxFilter[4*i+2]= | |
3079 chrMmxFilter[4*i+3]= | |
3080 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; | |
3081 } | |
19172 | 3082 } |
18861 | 3083 #endif |
3084 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){ | |
3085 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
3086 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi | |
3087 RENAME(yuv2nv12X)(c, | |
3088 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
3089 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3090 dest, uDest, dstW, chrDstW, dstFormat); | |
3091 } | |
3092 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like | |
3093 { | |
3094 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
3095 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
3096 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12 | |
3097 { | |
3098 int16_t *lumBuf = lumPixBuf[0]; | |
3099 int16_t *chrBuf= chrPixBuf[0]; | |
3100 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW); | |
3101 } | |
3102 else //General YV12 | |
3103 { | |
3104 RENAME(yuv2yuvX)(c, | |
3105 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
3106 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3107 dest, uDest, vDest, dstW, chrDstW); | |
3108 } | |
3109 } | |
3110 else | |
3111 { | |
3112 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
3113 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
3114 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB | |
3115 { | |
3116 int chrAlpha= vChrFilter[2*dstY+1]; | |
3117 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), | |
3118 dest, dstW, chrAlpha, dstFormat, flags, dstY); | |
3119 } | |
3120 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB | |
3121 { | |
3122 int lumAlpha= vLumFilter[2*dstY+1]; | |
3123 int chrAlpha= vChrFilter[2*dstY+1]; | |
19172 | 3124 lumMmxFilter[2]= |
3125 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001; | |
3126 chrMmxFilter[2]= | |
3127 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001; | |
18861 | 3128 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), |
3129 dest, dstW, lumAlpha, chrAlpha, dstY); | |
3130 } | |
3131 else //General RGB | |
3132 { | |
3133 RENAME(yuv2packedX)(c, | |
3134 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
3135 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3136 dest, dstW, dstY); | |
3137 } | |
3138 } | |
3139 } | |
3140 else // hmm looks like we can't use MMX here without overwriting this array's tail | |
3141 { | |
3142 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
3143 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
3144 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){ | |
3145 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
3146 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi | |
3147 yuv2nv12XinC( | |
3148 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
3149 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3150 dest, uDest, dstW, chrDstW, dstFormat); | |
3151 } | |
3152 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 | |
3153 { | |
3154 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
3155 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
3156 yuv2yuvXinC( | |
3157 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
3158 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3159 dest, uDest, vDest, dstW, chrDstW); | |
3160 } | |
3161 else | |
3162 { | |
3163 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
3164 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
3165 yuv2packedXinC(c, | |
3166 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
3167 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3168 dest, dstW, dstY); | |
3169 } | |
3170 } | |
3171 } | |
3172 | |
3173 #ifdef HAVE_MMX | |
3174 __asm __volatile(SFENCE:::"memory"); | |
3175 __asm __volatile(EMMS:::"memory"); | |
3176 #endif | |
3177 /* store changed local vars back in the context */ | |
3178 c->dstY= dstY; | |
3179 c->lumBufIndex= lumBufIndex; | |
3180 c->chrBufIndex= chrBufIndex; | |
3181 c->lastInLumBuf= lastInLumBuf; | |
3182 c->lastInChrBuf= lastInChrBuf; | |
3183 | |
3184 return dstY - lastDstY; | |
3185 } |
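/* Illustrative usage sketch, not part of the original file: swScale consumes
   the picture as one or more top-to-bottom slices and returns the number of
   output lines written; callers normally reach it through the public
   sws_scale() entry point. With whole frames the call amounts to this
   (c and the pointers/strides are hypothetical caller-owned values): */
#if 0
{
	uint8_t *src[3], *dst[3];           // plane pointers filled in by the caller
	int      srcStride[3], dstStride[3];
	int outLines= RENAME(swScale)(c, src, srcStride, 0, c->srcH, dst, dstStride);
}
#endif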