Mercurial > mplayer.hg
annotate libswscale/swscale_template.c @ 19778:b7bba0853adf
KP7 bound to dvdnav 7 (previous menu)
author | nicodvb |
---|---|
date | Sun, 10 Sep 2006 10:37:19 +0000 |
parents | 4678e9f81334 |
children | 8e50cba9fe03 |
rev | line source |
---|---|
18861 | 1 /* |
2 Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> | |
3 | |
4 This program is free software; you can redistribute it and/or modify | |
5 it under the terms of the GNU General Public License as published by | |
6 the Free Software Foundation; either version 2 of the License, or | |
7 (at your option) any later version. | |
8 | |
9 This program is distributed in the hope that it will be useful, | |
10 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 GNU General Public License for more details. | |
13 | |
14 You should have received a copy of the GNU General Public License | |
15 along with this program; if not, write to the Free Software | |
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | |
19594
4678e9f81334
make the C code of the swscaler which i wrote LGPL
michael
parents:
19400
diff
changeset
|
17 |
4678e9f81334
make the C code of the swscaler which i wrote LGPL
michael
parents:
19400
diff
changeset
|
18 the C code (not assembly, mmx, ...) of the swscaler which has been written |
4678e9f81334
make the C code of the swscaler which i wrote LGPL
michael
parents:
19400
diff
changeset
|
19 by Michael Niedermayer can be used under the LGPL license too |
18861 | 20 */ |
21 | |
22 #undef REAL_MOVNTQ | |
23 #undef MOVNTQ | |
24 #undef PAVGB | |
25 #undef PREFETCH | |
26 #undef PREFETCHW | |
27 #undef EMMS | |
28 #undef SFENCE | |
29 | |
30 #ifdef HAVE_3DNOW | |
31 /* On K6 femms is faster than emms. On K7 femms is mapped directly to emms. */ | |
32 #define EMMS "femms" | |
33 #else | |
34 #define EMMS "emms" | |
35 #endif | |
36 | |
37 #ifdef HAVE_3DNOW | |
38 #define PREFETCH "prefetch" | |
39 #define PREFETCHW "prefetchw" | |
40 #elif defined ( HAVE_MMX2 ) | |
41 #define PREFETCH "prefetchnta" | |
42 #define PREFETCHW "prefetcht0" | |
43 #else | |
44 #define PREFETCH "/nop" | |
45 #define PREFETCHW "/nop" | |
46 #endif | |
47 | |
48 #ifdef HAVE_MMX2 | |
49 #define SFENCE "sfence" | |
50 #else | |
51 #define SFENCE "/nop" | |
52 #endif | |
53 | |
54 #ifdef HAVE_MMX2 | |
55 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | |
56 #elif defined (HAVE_3DNOW) | |
57 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" | |
58 #endif | |
59 | |
60 #ifdef HAVE_MMX2 | |
61 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" | |
62 #else | |
63 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" | |
64 #endif | |
65 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) | |
66 | |
67 #ifdef HAVE_ALTIVEC | |
68 #include "swscale_altivec_template.c" | |
69 #endif | |
70 | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
71 #define YSCALEYUV2YV12X(x, offset, dest, width) \ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
72 asm volatile(\ |
18861 | 73 "xor %%"REG_a", %%"REG_a" \n\t"\ |
74 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\ | |
75 "movq %%mm3, %%mm4 \n\t"\ | |
76 "lea " offset "(%0), %%"REG_d" \n\t"\ | |
77 "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
78 ASMALIGN(4) /* FIXME Unroll? */\ |
18861 | 79 "1: \n\t"\ |
80 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ | |
81 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\ | |
82 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\ | |
83 "add $16, %%"REG_d" \n\t"\ | |
84 "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
85 "test %%"REG_S", %%"REG_S" \n\t"\ | |
86 "pmulhw %%mm0, %%mm2 \n\t"\ | |
87 "pmulhw %%mm0, %%mm5 \n\t"\ | |
88 "paddw %%mm2, %%mm3 \n\t"\ | |
89 "paddw %%mm5, %%mm4 \n\t"\ | |
90 " jnz 1b \n\t"\ | |
91 "psraw $3, %%mm3 \n\t"\ | |
92 "psraw $3, %%mm4 \n\t"\ | |
93 "packuswb %%mm4, %%mm3 \n\t"\ | |
94 MOVNTQ(%%mm3, (%1, %%REGa))\ | |
95 "add $8, %%"REG_a" \n\t"\ | |
96 "cmp %2, %%"REG_a" \n\t"\ | |
97 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\ | |
98 "movq %%mm3, %%mm4 \n\t"\ | |
99 "lea " offset "(%0), %%"REG_d" \n\t"\ | |
100 "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
101 "jb 1b \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
102 :: "r" (&c->redDither),\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
103 "r" (dest), "p" (width)\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
104 : "%"REG_a, "%"REG_d, "%"REG_S\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
105 ); |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
106 |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
107 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
108 asm volatile(\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
109 "lea " offset "(%0), %%"REG_d" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
110 "xor %%"REG_a", %%"REG_a" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
111 "pxor %%mm4, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
112 "pxor %%mm5, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
113 "pxor %%mm6, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
114 "pxor %%mm7, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
115 "mov (%%"REG_d"), %%"REG_S" \n\t"\ |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
116 ASMALIGN(4) \ |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
117 "1: \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
118 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0\n\t" /* srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
119 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
120 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
121 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1\n\t" /* srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
122 "movq %%mm0, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
123 "punpcklwd %%mm1, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
124 "punpckhwd %%mm1, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
125 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
126 "pmaddwd %%mm1, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
127 "pmaddwd %%mm1, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
128 "paddd %%mm0, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
129 "paddd %%mm3, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
130 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3\n\t" /* srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
131 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
132 "add $16, %%"REG_d" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
133 "test %%"REG_S", %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
134 "movq %%mm2, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
135 "punpcklwd %%mm3, %%mm2 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
136 "punpckhwd %%mm3, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
137 "pmaddwd %%mm1, %%mm2 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
138 "pmaddwd %%mm1, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
139 "paddd %%mm2, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
140 "paddd %%mm0, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
141 " jnz 1b \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
142 "psrad $16, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
143 "psrad $16, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
144 "psrad $16, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
145 "psrad $16, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
146 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
147 "packssdw %%mm5, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
148 "packssdw %%mm7, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
149 "paddw %%mm0, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
150 "paddw %%mm0, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
151 "psraw $3, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
152 "psraw $3, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
153 "packuswb %%mm6, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
154 MOVNTQ(%%mm4, (%1, %%REGa))\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
155 "add $8, %%"REG_a" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
156 "cmp %2, %%"REG_a" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
157 "lea " offset "(%0), %%"REG_d" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
158 "pxor %%mm4, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
159 "pxor %%mm5, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
160 "pxor %%mm6, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
161 "pxor %%mm7, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
162 "mov (%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
163 "jb 1b \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
164 :: "r" (&c->redDither),\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
165 "r" (dest), "p" (width)\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
166 : "%"REG_a, "%"REG_d, "%"REG_S\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
167 ); |
18861 | 168 |
169 #define YSCALEYUV2YV121 \ | |
170 "mov %2, %%"REG_a" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
171 ASMALIGN(4) /* FIXME Unroll? */\ |
18861 | 172 "1: \n\t"\ |
173 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\ | |
174 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\ | |
175 "psraw $7, %%mm0 \n\t"\ | |
176 "psraw $7, %%mm1 \n\t"\ | |
177 "packuswb %%mm1, %%mm0 \n\t"\ | |
178 MOVNTQ(%%mm0, (%1, %%REGa))\ | |
179 "add $8, %%"REG_a" \n\t"\ | |
180 "jnc 1b \n\t" | |
181 | |
182 /* | |
183 :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
184 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
185 "r" (dest), "m" (dstW), | |
186 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
187 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
188 */ | |
189 #define YSCALEYUV2PACKEDX \ | |
19173 | 190 asm volatile(\ |
18861 | 191 "xor %%"REG_a", %%"REG_a" \n\t"\ |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
192 ASMALIGN(4)\ |
18861 | 193 "nop \n\t"\ |
194 "1: \n\t"\ | |
195 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\ | |
196 "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
197 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\ | |
198 "movq %%mm3, %%mm4 \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
199 ASMALIGN(4)\ |
18861 | 200 "2: \n\t"\ |
201 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ | |
202 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ | |
203 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ | |
204 "add $16, %%"REG_d" \n\t"\ | |
205 "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
206 "pmulhw %%mm0, %%mm2 \n\t"\ | |
207 "pmulhw %%mm0, %%mm5 \n\t"\ | |
208 "paddw %%mm2, %%mm3 \n\t"\ | |
209 "paddw %%mm5, %%mm4 \n\t"\ | |
210 "test %%"REG_S", %%"REG_S" \n\t"\ | |
211 " jnz 2b \n\t"\ | |
212 \ | |
213 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\ | |
214 "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
215 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\ | |
216 "movq %%mm1, %%mm7 \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
217 ASMALIGN(4)\ |
18861 | 218 "2: \n\t"\ |
219 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ | |
220 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\ | |
221 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\ | |
222 "add $16, %%"REG_d" \n\t"\ | |
223 "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
224 "pmulhw %%mm0, %%mm2 \n\t"\ | |
225 "pmulhw %%mm0, %%mm5 \n\t"\ | |
226 "paddw %%mm2, %%mm1 \n\t"\ | |
227 "paddw %%mm5, %%mm7 \n\t"\ | |
228 "test %%"REG_S", %%"REG_S" \n\t"\ | |
229 " jnz 2b \n\t"\ | |
230 | |
19173 | 231 #define YSCALEYUV2PACKEDX_END\ |
232 :: "r" (&c->redDither), \ | |
233 "m" (dummy), "m" (dummy), "m" (dummy),\ | |
234 "r" (dest), "m" (dstW)\ | |
235 : "%"REG_a, "%"REG_d, "%"REG_S\ | |
236 ); | |
237 | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
238 #define YSCALEYUV2PACKEDX_ACCURATE \ |
19173 | 239 asm volatile(\ |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
240 "xor %%"REG_a", %%"REG_a" \n\t"\ |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
241 ASMALIGN(4)\ |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
242 "nop \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
243 "1: \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
244 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
245 "mov (%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
246 "pxor %%mm4, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
247 "pxor %%mm5, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
248 "pxor %%mm6, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
249 "pxor %%mm7, %%mm7 \n\t"\ |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
250 ASMALIGN(4)\ |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
251 "2: \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
252 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
253 "movq 4096(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
254 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
255 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
256 "movq %%mm0, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
257 "punpcklwd %%mm1, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
258 "punpckhwd %%mm1, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
259 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
260 "pmaddwd %%mm1, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
261 "pmaddwd %%mm1, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
262 "paddd %%mm0, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
263 "paddd %%mm3, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
264 "movq 4096(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
265 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
266 "add $16, %%"REG_d" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
267 "test %%"REG_S", %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
268 "movq %%mm2, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
269 "punpcklwd %%mm3, %%mm2 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
270 "punpckhwd %%mm3, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
271 "pmaddwd %%mm1, %%mm2 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
272 "pmaddwd %%mm1, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
273 "paddd %%mm2, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
274 "paddd %%mm0, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
275 " jnz 2b \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
276 "psrad $16, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
277 "psrad $16, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
278 "psrad $16, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
279 "psrad $16, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
280 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
281 "packssdw %%mm5, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
282 "packssdw %%mm7, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
283 "paddw %%mm0, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
284 "paddw %%mm0, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
285 "movq %%mm4, "U_TEMP"(%0) \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
286 "movq %%mm6, "V_TEMP"(%0) \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
287 \ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
288 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
289 "mov (%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
290 "pxor %%mm1, %%mm1 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
291 "pxor %%mm5, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
292 "pxor %%mm7, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
293 "pxor %%mm6, %%mm6 \n\t"\ |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
294 ASMALIGN(4)\ |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
295 "2: \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
296 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
297 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
298 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
299 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
300 "movq %%mm0, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
301 "punpcklwd %%mm4, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
302 "punpckhwd %%mm4, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
303 "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
304 "pmaddwd %%mm4, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
305 "pmaddwd %%mm4, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
306 "paddd %%mm0, %%mm1 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
307 "paddd %%mm3, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
308 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
309 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
310 "add $16, %%"REG_d" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
311 "test %%"REG_S", %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
312 "movq %%mm2, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
313 "punpcklwd %%mm3, %%mm2 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
314 "punpckhwd %%mm3, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
315 "pmaddwd %%mm4, %%mm2 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
316 "pmaddwd %%mm4, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
317 "paddd %%mm2, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
318 "paddd %%mm0, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
319 " jnz 2b \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
320 "psrad $16, %%mm1 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
321 "psrad $16, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
322 "psrad $16, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
323 "psrad $16, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
324 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
325 "packssdw %%mm5, %%mm1 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
326 "packssdw %%mm6, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
327 "paddw %%mm0, %%mm1 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
328 "paddw %%mm0, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
329 "movq "U_TEMP"(%0), %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
330 "movq "V_TEMP"(%0), %%mm4 \n\t"\ |
18861 | 331 |
19173 | 332 #define YSCALEYUV2RGBX /* in: mm3=U, mm4=V, mm1=Y1, mm7=Y2 (16-bit words); out: packed bytes mm2=B, mm4=G, mm5=R, and mm7=0 */ \ |
18861 | 333 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ |
334 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ | |
335 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
336 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
337 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ | |
338 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ | |
339 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
340 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ | |
341 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ | |
342 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ | |
343 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ | |
344 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ | |
345 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ | |
346 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
347 "paddw %%mm3, %%mm4 \n\t" /* mm4 = ug + vg */\ | |
348 "movq %%mm2, %%mm0 \n\t"\ | |
349 "movq %%mm5, %%mm6 \n\t"\ | |
350 "movq %%mm4, %%mm3 \n\t"\ | |
351 "punpcklwd %%mm2, %%mm2 \n\t" /* low two chroma words, each duplicated (one chroma sample per two pixels) */\ | |
352 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
353 "punpcklwd %%mm4, %%mm4 \n\t"\ | |
354 "paddw %%mm1, %%mm2 \n\t"\ | |
355 "paddw %%mm1, %%mm5 \n\t"\ | |
356 "paddw %%mm1, %%mm4 \n\t"\ | |
357 "punpckhwd %%mm0, %%mm0 \n\t" /* high two chroma words, each duplicated */\ | |
358 "punpckhwd %%mm6, %%mm6 \n\t"\ | |
359 "punpckhwd %%mm3, %%mm3 \n\t"\ | |
360 "paddw %%mm7, %%mm0 \n\t"\ | |
361 "paddw %%mm7, %%mm6 \n\t"\ | |
362 "paddw %%mm7, %%mm3 \n\t"\ | |
363 /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = summed with Y1, 2 = summed with Y2) */\ | |
364 "packuswb %%mm0, %%mm2 \n\t" /* mm2 = 8 B bytes, saturated */\ | |
365 "packuswb %%mm6, %%mm5 \n\t" /* mm5 = 8 R bytes, saturated */\ | |
366 "packuswb %%mm3, %%mm4 \n\t" /* mm4 = 8 G bytes, saturated */\ | |
367 "pxor %%mm7, %%mm7 \n\t" | |
368 #if 0 /* FULL_YSCALEYUV2RGB below is compiled out */ | |
369 #define FULL_YSCALEYUV2RGB \ | |
370 "pxor %%mm7, %%mm7 \n\t"\ | |
371 "movd %6, %%mm6 \n\t" /*yalpha1*/\ | |
372 "punpcklwd %%mm6, %%mm6 \n\t"\ | |
373 "punpcklwd %%mm6, %%mm6 \n\t"\ | |
374 "movd %7, %%mm5 \n\t" /*uvalpha1*/\ | |
375 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
376 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
377 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
378 ASMALIGN(4)\ |
18861 | 379 "1: \n\t"\ |
380 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\ | |
381 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
382 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
383 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
384 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
385 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | |
386 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
387 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
388 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\ | |
389 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
390 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\ | |
391 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
392 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ | |
393 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\ | |
394 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
395 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\ | |
396 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\ | |
397 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\ | |
398 \ | |
399 \ | |
400 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
401 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
402 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\ | |
403 "psraw $4, %%mm0 \n\t" /* uvbuf1[eax+2048] >>4*/\ | |
404 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\ | |
405 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\ | |
406 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\ | |
407 \ | |
408 \ | |
409 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ | |
410 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\ | |
411 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\ | |
412 "paddw %%mm1, %%mm3 \n\t" /* B*/\ | |
413 "paddw %%mm1, %%mm0 \n\t" /* R*/\ | |
414 "packuswb %%mm3, %%mm3 \n\t"\ | |
415 \ | |
416 "packuswb %%mm0, %%mm0 \n\t"\ | |
417 "paddw %%mm4, %%mm2 \n\t" /* ug + vg */\ | |
418 "paddw %%mm2, %%mm1 \n\t" /* G*/\ | |
419 \ | |
420 "packuswb %%mm1, %%mm1 \n\t" | |
421 #endif /* 0 */ | |
422 | |
423 #define REAL_YSCALEYUV2PACKED(index, c) /* blends two luma and two chroma rows; leaves mm1/mm7 = luma words, mm3/mm4 = the two chroma planes */ \ | |
424 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\ | |
425 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\ | |
426 "psraw $3, %%mm0 \n\t" /* chroma coefficient >>3 */\ | |
427 "psraw $3, %%mm1 \n\t" /* luma coefficient >>3 */\ | |
428 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t" /* written back in place: the stored coefficient is modified */\ | |
429 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t" /* written back in place: the stored coefficient is modified */\ | |
430 "xor "#index", "#index" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
431 ASMALIGN(4)\ |
18861 | 432 "1: \n\t"\ |
433 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
434 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
435 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | |
436 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
437 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | |
438 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
439 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\ | |
440 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
441 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
442 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\ | |
443 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\ | |
444 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\ | |
445 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\ | |
446 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ | |
447 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
448 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax+4] (next 4 words)*/\ | |
449 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax+4] (next 4 words)*/\ | |
450 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
451 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4]*/\ | |
452 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
453 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\ | |
454 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\ | |
455 "psraw $7, %%mm7 \n\t" /* buf1[eax+4] >>7*/\ | |
456 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
457 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/\ | |
458 | |
459 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) | |
460 | |
461 #define REAL_YSCALEYUV2RGB(index, c) /* blends two luma and two chroma rows, then converts to packed bytes mm2=B, mm4=G, mm5=R with mm7=0 */ \ | |
462 "xor "#index", "#index" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
463 ASMALIGN(4)\ |
18861 | 464 "1: \n\t"\ |
465 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
466 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
467 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\ | |
468 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\ | |
469 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | |
470 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
471 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\ | |
472 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
473 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
474 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\ | |
475 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\ | |
476 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\ | |
477 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\ | |
478 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ | |
479 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
480 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
481 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
482 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\ | |
483 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\ | |
484 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
485 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ | |
486 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
487 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax+4] (next 4 words)*/\ | |
488 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax+4] (next 4 words)*/\ | |
489 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
490 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4]*/\ | |
491 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
492 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\ | |
493 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\ | |
494 "psraw $4, %%mm7 \n\t" /* buf1[eax+4] >>4*/\ | |
495 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
496 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/\ | |
497 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\ | |
498 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\ | |
499 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
500 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
501 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
502 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
503 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
504 "paddw %%mm3, %%mm4 \n\t" /* mm4 = ug + vg */\ | |
505 "movq %%mm2, %%mm0 \n\t"\ | |
506 "movq %%mm5, %%mm6 \n\t"\ | |
507 "movq %%mm4, %%mm3 \n\t"\ | |
508 "punpcklwd %%mm2, %%mm2 \n\t"\ | |
509 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
510 "punpcklwd %%mm4, %%mm4 \n\t"\ | |
511 "paddw %%mm1, %%mm2 \n\t"\ | |
512 "paddw %%mm1, %%mm5 \n\t"\ | |
513 "paddw %%mm1, %%mm4 \n\t"\ | |
514 "punpckhwd %%mm0, %%mm0 \n\t"\ | |
515 "punpckhwd %%mm6, %%mm6 \n\t"\ | |
516 "punpckhwd %%mm3, %%mm3 \n\t"\ | |
517 "paddw %%mm7, %%mm0 \n\t"\ | |
518 "paddw %%mm7, %%mm6 \n\t"\ | |
519 "paddw %%mm7, %%mm3 \n\t"\ | |
520 /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = summed with Y1, 2 = summed with Y2) */\ | |
521 "packuswb %%mm0, %%mm2 \n\t"\ | |
522 "packuswb %%mm6, %%mm5 \n\t"\ | |
523 "packuswb %%mm3, %%mm4 \n\t"\ | |
524 "pxor %%mm7, %%mm7 \n\t" | |
525 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c) | |
526 | |
527 #define REAL_YSCALEYUV2PACKED1(index, c) /* single-row variant: reads only %0 (luma) and %2 (chroma), no blending; c is not referenced */ \ | |
528 "xor "#index", "#index" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
529 ASMALIGN(4)\ |
18861 | 530 "1: \n\t"\ |
531 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ | |
532 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
533 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] >>7 */ \ | |
534 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] >>7 */ \ | |
535 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | |
536 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4] (next 4 words)*/\ | |
537 "psraw $7, %%mm1 \n\t" /* buf0[eax] >>7 */ \ | |
538 "psraw $7, %%mm7 \n\t" /* buf0[eax+4] >>7 */ \ | |
539 | |
540 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) | |
541 | |
542 #define REAL_YSCALEYUV2RGB1(index, c) /* single-row variant: reads only %0 (luma) and %2 (chroma); out: packed bytes mm2=B, mm4=G, mm5=R, mm7=0 */ \ | |
543 "xor "#index", "#index" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
544 ASMALIGN(4)\ |
18861 | 545 "1: \n\t"\ |
546 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ | |
547 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
548 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\ | |
549 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\ | |
550 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ | |
551 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
552 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
553 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
554 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\ | |
555 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\ | |
556 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
557 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | |
558 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4] (next 4 words)*/\ | |
559 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\ | |
560 "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\ | |
561 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\ | |
562 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\ | |
563 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
564 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
565 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
566 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
567 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
568 "paddw %%mm3, %%mm4 \n\t" /* mm4 = ug + vg */\ | |
569 "movq %%mm2, %%mm0 \n\t"\ | |
570 "movq %%mm5, %%mm6 \n\t"\ | |
571 "movq %%mm4, %%mm3 \n\t"\ | |
572 "punpcklwd %%mm2, %%mm2 \n\t"\ | |
573 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
574 "punpcklwd %%mm4, %%mm4 \n\t"\ | |
575 "paddw %%mm1, %%mm2 \n\t"\ | |
576 "paddw %%mm1, %%mm5 \n\t"\ | |
577 "paddw %%mm1, %%mm4 \n\t"\ | |
578 "punpckhwd %%mm0, %%mm0 \n\t"\ | |
579 "punpckhwd %%mm6, %%mm6 \n\t"\ | |
580 "punpckhwd %%mm3, %%mm3 \n\t"\ | |
581 "paddw %%mm7, %%mm0 \n\t"\ | |
582 "paddw %%mm7, %%mm6 \n\t"\ | |
583 "paddw %%mm7, %%mm3 \n\t"\ | |
584 /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = summed with Y1, 2 = summed with Y2) */\ | |
585 "packuswb %%mm0, %%mm2 \n\t"\ | |
586 "packuswb %%mm6, %%mm5 \n\t"\ | |
587 "packuswb %%mm3, %%mm4 \n\t"\ | |
588 "pxor %%mm7, %%mm7 \n\t" | |
589 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) | |
590 | |
591 #define REAL_YSCALEYUV2PACKED1b(index, c) /* one luma row (%0), chroma taken as the sum of two rows (%2, %3) shifted down; c is not referenced */ \ | |
592 "xor "#index", "#index" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
593 ASMALIGN(4)\ |
18861 | 594 "1: \n\t"\ |
595 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
596 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
597 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | |
598 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
599 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ | |
600 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ | |
601 "psrlw $8, %%mm3 \n\t" /* (uvbuf0[eax] + uvbuf1[eax]) >>8, logical shift */ \ | |
602 "psrlw $8, %%mm4 \n\t" /* (uvbuf0[eax+2048] + uvbuf1[eax+2048]) >>8, logical shift */ \ | |
603 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | |
604 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4] (next 4 words)*/\ | |
605 "psraw $7, %%mm1 \n\t" /* buf0[eax] >>7 */ \ | |
606 "psraw $7, %%mm7 \n\t" /* buf0[eax+4] >>7 */ | |
607 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) | |
608 | |
609 // do vertical chrominance interpolation: chroma is the sum of the two rows %2 and %3, luma comes from %0 only | |
610 #define REAL_YSCALEYUV2RGB1b(index, c) \ | |
611 "xor "#index", "#index" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
612 ASMALIGN(4)\ |
18861 | 613 "1: \n\t"\ |
614 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
615 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
616 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | |
617 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
618 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ | |
619 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ | |
620 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ | |
621 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ | |
622 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ | |
623 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
624 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
625 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
626 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\ | |
627 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\ | |
628 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
629 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | |
630 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4] (next 4 words)*/\ | |
631 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\ | |
632 "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\ | |
633 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\ | |
634 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\ | |
635 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
636 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
637 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
638 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
639 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
640 "paddw %%mm3, %%mm4 \n\t" /* mm4 = ug + vg */\ | |
641 "movq %%mm2, %%mm0 \n\t"\ | |
642 "movq %%mm5, %%mm6 \n\t"\ | |
643 "movq %%mm4, %%mm3 \n\t"\ | |
644 "punpcklwd %%mm2, %%mm2 \n\t"\ | |
645 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
646 "punpcklwd %%mm4, %%mm4 \n\t"\ | |
647 "paddw %%mm1, %%mm2 \n\t"\ | |
648 "paddw %%mm1, %%mm5 \n\t"\ | |
649 "paddw %%mm1, %%mm4 \n\t"\ | |
650 "punpckhwd %%mm0, %%mm0 \n\t"\ | |
651 "punpckhwd %%mm6, %%mm6 \n\t"\ | |
652 "punpckhwd %%mm3, %%mm3 \n\t"\ | |
653 "paddw %%mm7, %%mm0 \n\t"\ | |
654 "paddw %%mm7, %%mm6 \n\t"\ | |
655 "paddw %%mm7, %%mm3 \n\t"\ | |
656 /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = summed with Y1, 2 = summed with Y2) */\ | |
657 "packuswb %%mm0, %%mm2 \n\t"\ | |
658 "packuswb %%mm6, %%mm5 \n\t"\ | |
659 "packuswb %%mm3, %%mm4 \n\t"\ | |
660 "pxor %%mm7, %%mm7 \n\t" | |
661 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) | |
662 | |
663 #define REAL_WRITEBGR32(dst, dstw, index) \ | |
664 /* in: mm2=B, %%mm4=G, %%mm5=R (8 bytes each), %%mm7=0; stores 8 pixels of 4 bytes each, in memory order B,G,R,0 */\ | |
665 "movq %%mm2, %%mm1 \n\t" /* B */\ | |
666 "movq %%mm5, %%mm6 \n\t" /* R */\ | |
667 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
668 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
669 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
670 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
671 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
672 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
673 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
674 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
675 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
676 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
677 \ | |
678 MOVNTQ(%%mm0, (dst, index, 4))\ | |
679 MOVNTQ(%%mm2, 8(dst, index, 4))\ | |
680 MOVNTQ(%%mm1, 16(dst, index, 4))\ | |
681 MOVNTQ(%%mm3, 24(dst, index, 4))\ | |
682 \ | |
683 "add $8, "#index" \n\t" /* 8 pixels written per iteration */\ | |
684 "cmp "#dstw", "#index" \n\t"\ | |
685 " jb 1b \n\t" /* back to the preceding label 1 while index < dstw (unsigned compare) */ | |
686 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index) | |
687 | |
688 #define REAL_WRITEBGR16(dst, dstw, index) /* in: mm2=B, mm4=G, mm5=R bytes, mm7=0; stores 8 pixels of 2 bytes each. NOTE(review): bF8/bFC are defined outside this chunk, presumably per-byte 0xF8/0xFC masks - confirm */ \ | |
689 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ | |
690 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\ | |
691 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ | |
692 "psrlq $3, %%mm2 \n\t" /* B >> 3 */\ | |
693 \ | |
694 "movq %%mm2, %%mm1 \n\t"\ | |
695 "movq %%mm4, %%mm3 \n\t"\ | |
696 \ | |
697 "punpcklbw %%mm7, %%mm3 \n\t" /* G widened to words, pixels 0-3 */\ | |
698 "punpcklbw %%mm5, %%mm2 \n\t" /* words with B in the low byte, R in the high byte, pixels 0-3 */\ | |
699 "punpckhbw %%mm7, %%mm4 \n\t" /* G widened to words, pixels 4-7 */\ | |
700 "punpckhbw %%mm5, %%mm1 \n\t" /* words with B in the low byte, R in the high byte, pixels 4-7 */\ | |
701 \ | |
702 "psllq $3, %%mm3 \n\t" /* G << 3 */\ | |
703 "psllq $3, %%mm4 \n\t" /* G << 3 */\ | |
704 \ | |
705 "por %%mm3, %%mm2 \n\t"\ | |
706 "por %%mm4, %%mm1 \n\t"\ | |
707 \ | |
708 MOVNTQ(%%mm2, (dst, index, 2))\ | |
709 MOVNTQ(%%mm1, 8(dst, index, 2))\ | |
710 \ | |
711 "add $8, "#index" \n\t" /* 8 pixels written per iteration */\ | |
712 "cmp "#dstw", "#index" \n\t"\ | |
713 " jb 1b \n\t" /* back to the preceding label 1 while index < dstw (unsigned compare) */ | |
714 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index) | |
715 | |
716 #define REAL_WRITEBGR15(dst, dstw, index) /* in: mm2=B, mm4=G, mm5=R bytes, mm7=0; stores 8 pixels of 2 bytes each. NOTE(review): bF8 is defined outside this chunk, presumably a per-byte 0xF8 mask - confirm */ \ | |
717 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ | |
718 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ | |
719 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ | |
720 "psrlq $3, %%mm2 \n\t" /* B >> 3 */\ | |
721 "psrlq $1, %%mm5 \n\t" /* R >> 1 */\ | |
722 \ | |
723 "movq %%mm2, %%mm1 \n\t"\ | |
724 "movq %%mm4, %%mm3 \n\t"\ | |
725 \ | |
726 "punpcklbw %%mm7, %%mm3 \n\t" /* G widened to words, pixels 0-3 */\ | |
727 "punpcklbw %%mm5, %%mm2 \n\t" /* words with B in the low byte, R in the high byte, pixels 0-3 */\ | |
728 "punpckhbw %%mm7, %%mm4 \n\t" /* G widened to words, pixels 4-7 */\ | |
729 "punpckhbw %%mm5, %%mm1 \n\t" /* words with B in the low byte, R in the high byte, pixels 4-7 */\ | |
730 \ | |
731 "psllq $2, %%mm3 \n\t" /* G << 2 */\ | |
732 "psllq $2, %%mm4 \n\t" /* G << 2 */\ | |
733 \ | |
734 "por %%mm3, %%mm2 \n\t"\ | |
735 "por %%mm4, %%mm1 \n\t"\ | |
736 \ | |
737 MOVNTQ(%%mm2, (dst, index, 2))\ | |
738 MOVNTQ(%%mm1, 8(dst, index, 2))\ | |
739 \ | |
740 "add $8, "#index" \n\t" /* 8 pixels written per iteration */\ | |
741 "cmp "#dstw", "#index" \n\t"\ | |
742 " jb 1b \n\t" /* back to the preceding label 1 while index < dstw (unsigned compare) */ | |
743 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index) | |
744 | |
745 #define WRITEBGR24OLD(dst, dstw, index) \ | |
746 /* in: mm2=B, %%mm4=G, %%mm5=R, %%mm7=0; stores 8 pixels as 24 bytes at dst and advances dst by 24 */\ | |
747 "movq %%mm2, %%mm1 \n\t" /* B */\ | |
748 "movq %%mm5, %%mm6 \n\t" /* R */\ | |
749 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
750 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
751 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
752 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
753 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
754 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
755 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
756 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
757 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
758 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
759 \ | |
760 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ | |
761 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\ | |
762 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\ | |
763 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\ | |
764 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\ | |
765 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\ | |
766 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\ | |
767 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ | |
768 \ | |
769 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
770 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\ | |
771 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\ | |
772 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\ | |
773 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\ | |
774 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\ | |
775 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\ | |
776 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\ | |
777 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\ | |
778 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\ | |
779 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\ | |
780 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\ | |
781 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\ | |
782 \ | |
783 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\ | |
784 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\ | |
785 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\ | |
786 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\ | |
787 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\ | |
788 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\ | |
789 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\ | |
790 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\ | |
791 \ | |
792 MOVNTQ(%%mm0, (dst))\ | |
793 MOVNTQ(%%mm2, 8(dst))\ | |
794 MOVNTQ(%%mm3, 16(dst))\ | |
795 "add $24, "#dst" \n\t" /* 8 pixels * 3 bytes */\ | |
796 \ | |
797 "add $8, "#index" \n\t"\ | |
798 "cmp "#dstw", "#index" \n\t"\ | |
799 " jb 1b \n\t" /* back to the preceding label 1 while index < dstw (unsigned compare) */ | |
800 | |
801 #define WRITEBGR24MMX(dst, dstw, index) \ | |
802 /* in: mm2=B, %%mm4=G, %%mm5=R, %%mm7=0; stores 8 pixels as 24 bytes at dst and advances dst by 24 (mm7 is overwritten) */\ | |
803 "movq %%mm2, %%mm1 \n\t" /* B */\ | |
804 "movq %%mm5, %%mm6 \n\t" /* R */\ | |
805 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
806 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
807 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
808 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
809 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
810 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
811 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
812 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
813 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
814 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
815 \ | |
816 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ | |
817 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ | |
818 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ | |
819 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ | |
820 \ | |
821 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ | |
822 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ | |
823 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ | |
824 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ | |
825 \ | |
826 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ | |
827 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ | |
828 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ | |
829 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ | |
830 \ | |
831 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ | |
832 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ | |
833 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ | |
834 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ | |
835 MOVNTQ(%%mm0, (dst))\ | |
836 \ | |
837 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ | |
838 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ | |
839 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ | |
840 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ | |
841 MOVNTQ(%%mm6, 8(dst))\ | |
842 \ | |
843 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ | |
844 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ | |
845 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ | |
846 MOVNTQ(%%mm5, 16(dst))\ | |
847 \ | |
848 "add $24, "#dst" \n\t" /* 8 pixels * 3 bytes */\ | |
849 \ | |
850 "add $8, "#index" \n\t"\ | |
851 "cmp "#dstw", "#index" \n\t"\ | |
852 " jb 1b \n\t" /* back to the preceding label 1 while index < dstw (unsigned compare) */ | |
853 | |
854 #define WRITEBGR24MMX2(dst, dstw, index) \ | |
855 /* in: mm2=B, %%mm4=G, %%mm5=R; %%mm7 is not needed as zero here, it is reloaded with M24C below. Stores 8 pixels as 24 bytes and advances dst by 24 */\ | |
856 "movq "MANGLE(M24A)", %%mm0 \n\t"\ | |
857 "movq "MANGLE(M24C)", %%mm7 \n\t"\ | |
858 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ | |
859 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ | |
860 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ | |
861 \ | |
862 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ | |
863 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ | |
864 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ | |
865 \ | |
866 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ | |
867 "por %%mm1, %%mm6 \n\t"\ | |
868 "por %%mm3, %%mm6 \n\t"\ | |
869 MOVNTQ(%%mm6, (dst))\ | |
870 \ | |
871 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ | |
872 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ | |
873 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ | |
874 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ | |
875 \ | |
876 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ | |
877 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ | |
878 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ | |
879 \ | |
880 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ | |
881 "por %%mm3, %%mm6 \n\t"\ | |
882 MOVNTQ(%%mm6, 8(dst))\ | |
883 \ | |
884 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\ | |
885 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ | |
886 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\ | |
887 \ | |
888 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ | |
889 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ | |
890 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ | |
891 \ | |
892 "por %%mm1, %%mm3 \n\t"\ | |
893 "por %%mm3, %%mm6 \n\t"\ | |
894 MOVNTQ(%%mm6, 16(dst))\ | |
895 \ | |
896 "add $24, "#dst" \n\t" /* 8 pixels * 3 bytes */\ | |
897 \ | |
898 "add $8, "#index" \n\t"\ | |
899 "cmp "#dstw", "#index" \n\t"\ | |
900 " jb 1b \n\t" /* back to the preceding label 1 while index < dstw (unsigned compare) */ | |
901 | |
902 #ifdef HAVE_MMX2 /* WRITEBGR24 uses the pshufw-based writer when MMX2 is available */ | |
903 #undef WRITEBGR24 | |
904 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index) | |
905 #else /* !HAVE_MMX2: fall back to the plain MMX writer */ | |
906 #undef WRITEBGR24 | |
907 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) | |
908 #endif /* HAVE_MMX2 */ | |
909 | |
910 #define REAL_WRITEYUY2(dst, dstw, index) /* in: words in mm1/mm7 (luma) and mm3/mm4 (the two chroma planes, as left by the YSCALEYUV2PACKED* macros); stores 8 pixels of 2 bytes each */ \ | |
911 "packuswb %%mm3, %%mm3 \n\t" /* first chroma plane -> bytes */\ | |
912 "packuswb %%mm4, %%mm4 \n\t" /* second chroma plane -> bytes */\ | |
913 "packuswb %%mm7, %%mm1 \n\t" /* 8 luma bytes */\ | |
914 "punpcklbw %%mm4, %%mm3 \n\t" /* interleave chroma: mm3 byte, mm4 byte, ... */\ | |
915 "movq %%mm1, %%mm7 \n\t"\ | |
916 "punpcklbw %%mm3, %%mm1 \n\t" /* luma/chroma interleaved, pixels 0-3 */\ | |
917 "punpckhbw %%mm3, %%mm7 \n\t" /* luma/chroma interleaved, pixels 4-7 */\ | |
918 \ | |
919 MOVNTQ(%%mm1, (dst, index, 2))\ | |
920 MOVNTQ(%%mm7, 8(dst, index, 2))\ | |
921 \ | |
922 "add $8, "#index" \n\t" /* 8 pixels written per iteration */\ | |
923 "cmp "#dstw", "#index" \n\t"\ | |
924 " jb 1b \n\t" /* back to the preceding label 1 while index < dstw (unsigned compare) */ | |
925 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) | |
926 | |
927 | |
928 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | |
929 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
930 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) | |
931 { /* Fills dest (and uDest/vDest when uDest is non-NULL) using the MMX macros, the AltiVec routine or the plain C routine, selected at compile time. */ | |
932 #ifdef HAVE_MMX | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
933 if(c->flags & SWS_ACCURATE_RND){ /* caller asked for the accurate-rounding variants */ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
934 if(uDest){ /* chroma planes are written only when uDest is non-NULL */ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
935 YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW) /* uDest: first argument 0 */ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
936 YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW) /* vDest: same filter, first argument 4096 */ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
937 } |
18861 | 938 |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
939 YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW) /* luma plane, always written */ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
940 }else{ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
941 if(uDest){ /* chroma planes are written only when uDest is non-NULL */ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
942 YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW) /* uDest: first argument 0 */ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
943 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW) /* vDest: same filter, first argument 4096 */ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
944 } |
18861 | 945 |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
946 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW) /* luma plane, always written */ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
947 } |
18861 | 948 #else /* !HAVE_MMX: c is not used on the paths below */ |
949 #ifdef HAVE_ALTIVEC | |
950 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize, | |
951 chrFilter, chrSrc, chrFilterSize, | |
952 dest, uDest, vDest, dstW, chrDstW); | |
953 #else // !HAVE_ALTIVEC | |
954 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, | |
955 chrFilter, chrSrc, chrFilterSize, | |
956 dest, uDest, vDest, dstW, chrDstW); | |
957 #endif // HAVE_ALTIVEC | |
958 #endif /* HAVE_MMX */ | |
959 } | |
960 | |
961 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | |
962 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
963 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat) | |
964 { | |
965 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize, | |
966 chrFilter, chrSrc, chrFilterSize, | |
967 dest, uDest, dstW, chrDstW, dstFormat); | |
968 } | |
969 | |
970 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc, | |
971 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) | |
972 { | |
973 #ifdef HAVE_MMX | |
974 if(uDest != NULL) | |
975 { | |
976 asm volatile( | |
977 YSCALEYUV2YV121 | |
978 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW), | |
979 "g" (-chrDstW) | |
980 : "%"REG_a | |
981 ); | |
982 | |
983 asm volatile( | |
984 YSCALEYUV2YV121 | |
985 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW), | |
986 "g" (-chrDstW) | |
987 : "%"REG_a | |
988 ); | |
989 } | |
990 | |
991 asm volatile( | |
992 YSCALEYUV2YV121 | |
993 :: "r" (lumSrc + dstW), "r" (dest + dstW), | |
994 "g" (-dstW) | |
995 : "%"REG_a | |
996 ); | |
997 #else | |
998 int i; | |
999 for(i=0; i<dstW; i++) | |
1000 { | |
1001 int val= lumSrc[i]>>7; | |
1002 | |
1003 if(val&256){ | |
1004 if(val<0) val=0; | |
1005 else val=255; | |
1006 } | |
1007 | |
1008 dest[i]= val; | |
1009 } | |
1010 | |
1011 if(uDest != NULL) | |
1012 for(i=0; i<chrDstW; i++) | |
1013 { | |
1014 int u=chrSrc[i]>>7; | |
1015 int v=chrSrc[i + 2048]>>7; | |
1016 | |
1017 if((u|v)&256){ | |
1018 if(u<0) u=0; | |
1019 else if (u>255) u=255; | |
1020 if(v<0) v=0; | |
1021 else if (v>255) v=255; | |
1022 } | |
1023 | |
1024 uDest[i]= u; | |
1025 vDest[i]= v; | |
1026 } | |
1027 #endif | |
1028 } | |
1029 | |
1030 | |
1031 /** | |
1032 * vertical scale YV12 to RGB | |
1033 */ | |
1034 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | |
1035 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
1036 uint8_t *dest, long dstW, long dstY) | |
1037 { | |
1038 long dummy=0; | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1039 #ifdef HAVE_MMX |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1040 if(c->flags & SWS_ACCURATE_RND){ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1041 switch(c->dstFormat){ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1042 case IMGFMT_BGR32: |
19173 | 1043 YSCALEYUV2PACKEDX_ACCURATE |
1044 YSCALEYUV2RGBX | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1045 WRITEBGR32(%4, %5, %%REGa) |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1046 |
19173 | 1047 YSCALEYUV2PACKEDX_END |
1048 return; | |
1049 case IMGFMT_BGR24: | |
1050 YSCALEYUV2PACKEDX_ACCURATE | |
1051 YSCALEYUV2RGBX | |
19396 | 1052 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize |
1053 "add %4, %%"REG_c" \n\t" | |
1054 WRITEBGR24(%%REGc, %5, %%REGa) | |
19173 | 1055 |
1056 | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1057 :: "r" (&c->redDither), |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1058 "m" (dummy), "m" (dummy), "m" (dummy), |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1059 "r" (dest), "m" (dstW) |
19396 | 1060 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1061 ); |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1062 return; |
19173 | 1063 case IMGFMT_BGR15: |
1064 YSCALEYUV2PACKEDX_ACCURATE | |
1065 YSCALEYUV2RGBX | |
1066 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1067 #ifdef DITHER1XBPP | |
1068 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1069 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1070 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1071 #endif | |
1072 | |
1073 WRITEBGR15(%4, %5, %%REGa) | |
1074 YSCALEYUV2PACKEDX_END | |
1075 return; | |
1076 case IMGFMT_BGR16: | |
1077 YSCALEYUV2PACKEDX_ACCURATE | |
1078 YSCALEYUV2RGBX | |
1079 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1080 #ifdef DITHER1XBPP | |
1081 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1082 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1083 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1084 #endif | |
1085 | |
1086 WRITEBGR16(%4, %5, %%REGa) | |
1087 YSCALEYUV2PACKEDX_END | |
1088 return; | |
1089 case IMGFMT_YUY2: | |
1090 YSCALEYUV2PACKEDX_ACCURATE | |
1091 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1092 | |
1093 "psraw $3, %%mm3 \n\t" | |
1094 "psraw $3, %%mm4 \n\t" | |
1095 "psraw $3, %%mm1 \n\t" | |
1096 "psraw $3, %%mm7 \n\t" | |
1097 WRITEYUY2(%4, %5, %%REGa) | |
1098 YSCALEYUV2PACKEDX_END | |
1099 return; | |
1100 } | |
1101 }else{ | |
1102 switch(c->dstFormat) | |
1103 { | |
1104 case IMGFMT_BGR32: | |
1105 YSCALEYUV2PACKEDX | |
1106 YSCALEYUV2RGBX | |
1107 WRITEBGR32(%4, %5, %%REGa) | |
1108 YSCALEYUV2PACKEDX_END | |
1109 return; | |
1110 case IMGFMT_BGR24: | |
1111 YSCALEYUV2PACKEDX | |
1112 YSCALEYUV2RGBX | |
19396 | 1113 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize |
1114 "add %4, %%"REG_c" \n\t" | |
1115 WRITEBGR24(%%REGc, %5, %%REGa) | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1116 |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1117 :: "r" (&c->redDither), |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1118 "m" (dummy), "m" (dummy), "m" (dummy), |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1119 "r" (dest), "m" (dstW) |
19396 | 1120 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1121 ); |
19173 | 1122 return; |
1123 case IMGFMT_BGR15: | |
1124 YSCALEYUV2PACKEDX | |
1125 YSCALEYUV2RGBX | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1126 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1127 #ifdef DITHER1XBPP |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1128 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1129 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1130 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1131 #endif |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1132 |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1133 WRITEBGR15(%4, %5, %%REGa) |
19173 | 1134 YSCALEYUV2PACKEDX_END |
1135 return; | |
1136 case IMGFMT_BGR16: | |
1137 YSCALEYUV2PACKEDX | |
1138 YSCALEYUV2RGBX | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1139 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1140 #ifdef DITHER1XBPP |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1141 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1142 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1143 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1144 #endif |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1145 |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1146 WRITEBGR16(%4, %5, %%REGa) |
19173 | 1147 YSCALEYUV2PACKEDX_END |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1148 return; |
18861 | 1149 case IMGFMT_YUY2: |
1150 YSCALEYUV2PACKEDX | |
1151 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1152 | |
1153 "psraw $3, %%mm3 \n\t" | |
1154 "psraw $3, %%mm4 \n\t" | |
1155 "psraw $3, %%mm1 \n\t" | |
1156 "psraw $3, %%mm7 \n\t" | |
1157 WRITEYUY2(%4, %5, %%REGa) | |
19173 | 1158 YSCALEYUV2PACKEDX_END |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1159 return; |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1160 } |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1161 } |
18861 | 1162 #endif |
1163 #ifdef HAVE_ALTIVEC | |
1164 /* The following list of supported dstFormat values should | |
1165 match what's found in the body of altivec_yuv2packedX() */ | |
1166 if(c->dstFormat==IMGFMT_ABGR || c->dstFormat==IMGFMT_BGRA || | |
1167 c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 || | |
1168 c->dstFormat==IMGFMT_RGBA || c->dstFormat==IMGFMT_ARGB) | |
1169 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize, | |
1170 chrFilter, chrSrc, chrFilterSize, | |
1171 dest, dstW, dstY); | |
1172 else | |
1173 #endif | |
1174 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, | |
1175 chrFilter, chrSrc, chrFilterSize, | |
1176 dest, dstW, dstY); | |
1177 } | |
1178 | |
1179 /** | |
1180 * vertical bilinear scale YV12 to RGB | |
1181 */ | |
1182 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, | |
1183 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) | |
1184 { | |
1185 int yalpha1=yalpha^4095; | |
1186 int uvalpha1=uvalpha^4095; | |
1187 int i; | |
1188 | |
1189 #if 0 //isn't used | |
1190 if(flags&SWS_FULL_CHR_H_INT) | |
1191 { | |
1192 switch(dstFormat) | |
1193 { | |
1194 #ifdef HAVE_MMX | |
1195 case IMGFMT_BGR32: | |
1196 asm volatile( | |
1197 | |
1198 | |
1199 FULL_YSCALEYUV2RGB | |
1200 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
1201 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
1202 | |
1203 "movq %%mm3, %%mm1 \n\t" | |
1204 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
1205 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
1206 | |
1207 MOVNTQ(%%mm3, (%4, %%REGa, 4)) | |
1208 MOVNTQ(%%mm1, 8(%4, %%REGa, 4)) | |
1209 | |
1210 "add $4, %%"REG_a" \n\t" | |
1211 "cmp %5, %%"REG_a" \n\t" | |
1212 " jb 1b \n\t" | |
1213 | |
1214 | |
1215 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW), | |
1216 "m" (yalpha1), "m" (uvalpha1) | |
1217 : "%"REG_a | |
1218 ); | |
1219 break; | |
1220 case IMGFMT_BGR24: | |
1221 asm volatile( | |
1222 | |
1223 FULL_YSCALEYUV2RGB | |
1224 | |
1225 // lsb ... msb | |
1226 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
1227 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
1228 | |
1229 "movq %%mm3, %%mm1 \n\t" | |
1230 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
1231 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
1232 | |
1233 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0 | |
1234 "psrlq $8, %%mm3 \n\t" // GR0BGR00 | |
1235 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000 | |
1236 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00 | |
1237 "por %%mm2, %%mm3 \n\t" // BGRBGR00 | |
1238 "movq %%mm1, %%mm2 \n\t" | |
1239 "psllq $48, %%mm1 \n\t" // 000000BG | |
1240 "por %%mm1, %%mm3 \n\t" // BGRBGRBG | |
1241 | |
1242 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0 | |
1243 "psrld $16, %%mm2 \n\t" // R000R000 | |
1244 "psrlq $24, %%mm1 \n\t" // 0BGR0000 | |
1245 "por %%mm2, %%mm1 \n\t" // RBGRR000 | |
1246 | |
1247 "mov %4, %%"REG_b" \n\t" | |
1248 "add %%"REG_a", %%"REG_b" \n\t" | |
1249 | |
1250 #ifdef HAVE_MMX2 | |
1251 //FIXME Alignment | |
1252 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t" | |
1253 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t" | |
1254 #else | |
1255 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t" | |
1256 "psrlq $32, %%mm3 \n\t" | |
1257 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t" | |
1258 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t" | |
1259 #endif | |
1260 "add $4, %%"REG_a" \n\t" | |
1261 "cmp %5, %%"REG_a" \n\t" | |
1262 " jb 1b \n\t" | |
1263 | |
1264 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), | |
1265 "m" (yalpha1), "m" (uvalpha1) | |
1266 : "%"REG_a, "%"REG_b | |
1267 ); | |
1268 break; | |
1269 case IMGFMT_BGR15: | |
1270 asm volatile( | |
1271 | |
1272 FULL_YSCALEYUV2RGB | |
1273 #ifdef DITHER1XBPP | |
1274 "paddusb "MANGLE(g5Dither)", %%mm1\n\t" | |
1275 "paddusb "MANGLE(r5Dither)", %%mm0\n\t" | |
1276 "paddusb "MANGLE(b5Dither)", %%mm3\n\t" | |
1277 #endif | |
1278 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | |
1279 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
1280 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
1281 | |
1282 "psrlw $3, %%mm3 \n\t" | |
1283 "psllw $2, %%mm1 \n\t" | |
1284 "psllw $7, %%mm0 \n\t" | |
1285 "pand "MANGLE(g15Mask)", %%mm1 \n\t" | |
1286 "pand "MANGLE(r15Mask)", %%mm0 \n\t" | |
1287 | |
1288 "por %%mm3, %%mm1 \n\t" | |
1289 "por %%mm1, %%mm0 \n\t" | |
1290 | |
1291 MOVNTQ(%%mm0, (%4, %%REGa, 2)) | |
1292 | |
1293 "add $4, %%"REG_a" \n\t" | |
1294 "cmp %5, %%"REG_a" \n\t" | |
1295 " jb 1b \n\t" | |
1296 | |
1297 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), | |
1298 "m" (yalpha1), "m" (uvalpha1) | |
1299 : "%"REG_a | |
1300 ); | |
1301 break; | |
1302 case IMGFMT_BGR16: | |
1303 asm volatile( | |
1304 | |
1305 FULL_YSCALEYUV2RGB | |
1306 #ifdef DITHER1XBPP | |
1307 "paddusb "MANGLE(g6Dither)", %%mm1\n\t" | |
1308 "paddusb "MANGLE(r5Dither)", %%mm0\n\t" | |
1309 "paddusb "MANGLE(b5Dither)", %%mm3\n\t" | |
1310 #endif | |
1311 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | |
1312 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
1313 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
1314 | |
1315 "psrlw $3, %%mm3 \n\t" | |
1316 "psllw $3, %%mm1 \n\t" | |
1317 "psllw $8, %%mm0 \n\t" | |
1318 "pand "MANGLE(g16Mask)", %%mm1 \n\t" | |
1319 "pand "MANGLE(r16Mask)", %%mm0 \n\t" | |
1320 | |
1321 "por %%mm3, %%mm1 \n\t" | |
1322 "por %%mm1, %%mm0 \n\t" | |
1323 | |
1324 MOVNTQ(%%mm0, (%4, %%REGa, 2)) | |
1325 | |
1326 "add $4, %%"REG_a" \n\t" | |
1327 "cmp %5, %%"REG_a" \n\t" | |
1328 " jb 1b \n\t" | |
1329 | |
1330 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), | |
1331 "m" (yalpha1), "m" (uvalpha1) | |
1332 : "%"REG_a | |
1333 ); | |
1334 break; | |
1335 #endif | |
1336 case IMGFMT_RGB32: | |
1337 #ifndef HAVE_MMX | |
1338 case IMGFMT_BGR32: | |
1339 #endif | |
1340 if(dstFormat==IMGFMT_BGR32) | |
1341 { | |
1342 int i; | |
1343 #ifdef WORDS_BIGENDIAN | |
1344 dest++; | |
1345 #endif | |
1346 for(i=0;i<dstW;i++){ | |
1347 // vertical linear interpolation && yuv2rgb in a single step: | |
1348 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1349 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1350 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1351 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; | |
1352 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1353 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
1354 dest+= 4; | |
1355 } | |
1356 } | |
1357 else if(dstFormat==IMGFMT_BGR24) | |
1358 { | |
1359 int i; | |
1360 for(i=0;i<dstW;i++){ | |
1361 // vertical linear interpolation && yuv2rgb in a single step: | |
1362 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1363 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1364 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1365 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; | |
1366 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1367 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
1368 dest+= 3; | |
1369 } | |
1370 } | |
1371 else if(dstFormat==IMGFMT_BGR16) | |
1372 { | |
1373 int i; | |
1374 for(i=0;i<dstW;i++){ | |
1375 // vertical linear interpolation && yuv2rgb in a single step: | |
1376 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1377 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1378 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1379 | |
1380 ((uint16_t*)dest)[i] = | |
1381 clip_table16b[(Y + yuvtab_40cf[U]) >>13] | | |
1382 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1383 clip_table16r[(Y + yuvtab_3343[V]) >>13]; | |
1384 } | |
1385 } | |
1386 else if(dstFormat==IMGFMT_BGR15) | |
1387 { | |
1388 int i; | |
1389 for(i=0;i<dstW;i++){ | |
1390 // vertical linear interpolation && yuv2rgb in a single step: | |
1391 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1392 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1393 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1394 | |
1395 ((uint16_t*)dest)[i] = | |
1396 clip_table15b[(Y + yuvtab_40cf[U]) >>13] | | |
1397 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1398 clip_table15r[(Y + yuvtab_3343[V]) >>13]; | |
1399 } | |
1400 } | |
1401 }//FULL_UV_IPOL | |
1402 else | |
1403 { | |
1404 #endif // if 0 | |
1405 #ifdef HAVE_MMX | |
1406 switch(c->dstFormat) | |
1407 { | |
1408 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( | |
1409 case IMGFMT_BGR32: | |
1410 asm volatile( | |
1411 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1412 "mov %4, %%"REG_b" \n\t" | |
1413 "push %%"REG_BP" \n\t" | |
1414 YSCALEYUV2RGB(%%REGBP, %5) | |
1415 WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1416 "pop %%"REG_BP" \n\t" | |
1417 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1418 | |
1419 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1420 "a" (&c->redDither) | |
1421 ); | |
1422 return; | |
1423 case IMGFMT_BGR24: | |
1424 asm volatile( | |
1425 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1426 "mov %4, %%"REG_b" \n\t" | |
1427 "push %%"REG_BP" \n\t" | |
1428 YSCALEYUV2RGB(%%REGBP, %5) | |
1429 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1430 "pop %%"REG_BP" \n\t" | |
1431 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1432 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1433 "a" (&c->redDither) | |
1434 ); | |
1435 return; | |
1436 case IMGFMT_BGR15: | |
1437 asm volatile( | |
1438 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1439 "mov %4, %%"REG_b" \n\t" | |
1440 "push %%"REG_BP" \n\t" | |
1441 YSCALEYUV2RGB(%%REGBP, %5) | |
1442 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1443 #ifdef DITHER1XBPP | |
1444 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1445 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1446 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1447 #endif | |
1448 | |
1449 WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | |
1450 "pop %%"REG_BP" \n\t" | |
1451 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1452 | |
1453 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1454 "a" (&c->redDither) | |
1455 ); | |
1456 return; | |
1457 case IMGFMT_BGR16: | |
1458 asm volatile( | |
1459 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1460 "mov %4, %%"REG_b" \n\t" | |
1461 "push %%"REG_BP" \n\t" | |
1462 YSCALEYUV2RGB(%%REGBP, %5) | |
1463 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1464 #ifdef DITHER1XBPP | |
1465 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1466 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1467 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1468 #endif | |
1469 | |
1470 WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | |
1471 "pop %%"REG_BP" \n\t" | |
1472 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1473 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1474 "a" (&c->redDither) | |
1475 ); | |
1476 return; | |
1477 case IMGFMT_YUY2: | |
1478 asm volatile( | |
1479 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1480 "mov %4, %%"REG_b" \n\t" | |
1481 "push %%"REG_BP" \n\t" | |
1482 YSCALEYUV2PACKED(%%REGBP, %5) | |
1483 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1484 "pop %%"REG_BP" \n\t" | |
1485 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1486 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1487 "a" (&c->redDither) | |
1488 ); | |
1489 return; | |
1490 default: break; | |
1491 } | |
1492 #endif //HAVE_MMX | |
1493 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C) | |
1494 } | |
1495 | |
1496 /** | |
1497 * YV12 to RGB without scaling or interpolating | |
1498 */ | |
1499 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, | |
1500 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y) | |
1501 { | |
1502 const int yalpha1=0; | |
1503 int i; | |
1504 | |
1505 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1 | |
1506 const int yalpha= 4096; //FIXME ... | |
1507 | |
1508 if(flags&SWS_FULL_CHR_H_INT) | |
1509 { | |
1510 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y); | |
1511 return; | |
1512 } | |
1513 | |
1514 #ifdef HAVE_MMX | |
1515 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster | |
1516 { | |
1517 switch(dstFormat) | |
1518 { | |
1519 case IMGFMT_BGR32: | |
1520 asm volatile( | |
1521 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1522 "mov %4, %%"REG_b" \n\t" | |
1523 "push %%"REG_BP" \n\t" | |
1524 YSCALEYUV2RGB1(%%REGBP, %5) | |
1525 WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1526 "pop %%"REG_BP" \n\t" | |
1527 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1528 | |
1529 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1530 "a" (&c->redDither) | |
1531 ); | |
1532 return; | |
1533 case IMGFMT_BGR24: | |
1534 asm volatile( | |
1535 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1536 "mov %4, %%"REG_b" \n\t" | |
1537 "push %%"REG_BP" \n\t" | |
1538 YSCALEYUV2RGB1(%%REGBP, %5) | |
1539 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1540 "pop %%"REG_BP" \n\t" | |
1541 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1542 | |
1543 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1544 "a" (&c->redDither) | |
1545 ); | |
1546 return; | |
1547 case IMGFMT_BGR15: | |
1548 asm volatile( | |
1549 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1550 "mov %4, %%"REG_b" \n\t" | |
1551 "push %%"REG_BP" \n\t" | |
1552 YSCALEYUV2RGB1(%%REGBP, %5) | |
1553 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1554 #ifdef DITHER1XBPP | |
1555 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1556 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1557 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1558 #endif | |
1559 WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | |
1560 "pop %%"REG_BP" \n\t" | |
1561 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1562 | |
1563 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1564 "a" (&c->redDither) | |
1565 ); | |
1566 return; | |
1567 case IMGFMT_BGR16: | |
1568 asm volatile( | |
1569 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1570 "mov %4, %%"REG_b" \n\t" | |
1571 "push %%"REG_BP" \n\t" | |
1572 YSCALEYUV2RGB1(%%REGBP, %5) | |
1573 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1574 #ifdef DITHER1XBPP | |
1575 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1576 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1577 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1578 #endif | |
1579 | |
1580 WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | |
1581 "pop %%"REG_BP" \n\t" | |
1582 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1583 | |
1584 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1585 "a" (&c->redDither) | |
1586 ); | |
1587 return; | |
1588 case IMGFMT_YUY2: | |
1589 asm volatile( | |
1590 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1591 "mov %4, %%"REG_b" \n\t" | |
1592 "push %%"REG_BP" \n\t" | |
1593 YSCALEYUV2PACKED1(%%REGBP, %5) | |
1594 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1595 "pop %%"REG_BP" \n\t" | |
1596 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1597 | |
1598 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1599 "a" (&c->redDither) | |
1600 ); | |
1601 return; | |
1602 } | |
1603 } | |
1604 else | |
1605 { | |
1606 switch(dstFormat) | |
1607 { | |
1608 case IMGFMT_BGR32: | |
1609 asm volatile( | |
1610 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1611 "mov %4, %%"REG_b" \n\t" | |
1612 "push %%"REG_BP" \n\t" | |
1613 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1614 WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1615 "pop %%"REG_BP" \n\t" | |
1616 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1617 | |
1618 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1619 "a" (&c->redDither) | |
1620 ); | |
1621 return; | |
1622 case IMGFMT_BGR24: | |
1623 asm volatile( | |
1624 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1625 "mov %4, %%"REG_b" \n\t" | |
1626 "push %%"REG_BP" \n\t" | |
1627 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1628 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1629 "pop %%"REG_BP" \n\t" | |
1630 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1631 | |
1632 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1633 "a" (&c->redDither) | |
1634 ); | |
1635 return; | |
1636 case IMGFMT_BGR15: | |
1637 asm volatile( | |
1638 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1639 "mov %4, %%"REG_b" \n\t" | |
1640 "push %%"REG_BP" \n\t" | |
1641 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1642 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1643 #ifdef DITHER1XBPP | |
1644 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1645 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1646 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1647 #endif | |
1648 WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | |
1649 "pop %%"REG_BP" \n\t" | |
1650 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1651 | |
1652 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1653 "a" (&c->redDither) | |
1654 ); | |
1655 return; | |
1656 case IMGFMT_BGR16: | |
1657 asm volatile( | |
1658 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1659 "mov %4, %%"REG_b" \n\t" | |
1660 "push %%"REG_BP" \n\t" | |
1661 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1662 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1663 #ifdef DITHER1XBPP | |
1664 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1665 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1666 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1667 #endif | |
1668 | |
1669 WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | |
1670 "pop %%"REG_BP" \n\t" | |
1671 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1672 | |
1673 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1674 "a" (&c->redDither) | |
1675 ); | |
1676 return; | |
1677 case IMGFMT_YUY2: | |
1678 asm volatile( | |
1679 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1680 "mov %4, %%"REG_b" \n\t" | |
1681 "push %%"REG_BP" \n\t" | |
1682 YSCALEYUV2PACKED1b(%%REGBP, %5) | |
1683 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1684 "pop %%"REG_BP" \n\t" | |
1685 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1686 | |
1687 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1688 "a" (&c->redDither) | |
1689 ); | |
1690 return; | |
1691 } | |
1692 } | |
1693 #endif | |
1694 if( uvalpha < 2048 ) | |
1695 { | |
1696 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C) | |
1697 }else{ | |
1698 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C) | |
1699 } | |
1700 } | |
1701 | |
1702 //FIXME yuy2* can read upto 7 samples to much | |
1703 | |
1704 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width) | |
1705 { | |
1706 #ifdef HAVE_MMX | |
1707 asm volatile( | |
1708 "movq "MANGLE(bm01010101)", %%mm2\n\t" | |
1709 "mov %0, %%"REG_a" \n\t" | |
1710 "1: \n\t" | |
1711 "movq (%1, %%"REG_a",2), %%mm0 \n\t" | |
1712 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" | |
1713 "pand %%mm2, %%mm0 \n\t" | |
1714 "pand %%mm2, %%mm1 \n\t" | |
1715 "packuswb %%mm1, %%mm0 \n\t" | |
1716 "movq %%mm0, (%2, %%"REG_a") \n\t" | |
1717 "add $8, %%"REG_a" \n\t" | |
1718 " js 1b \n\t" | |
1719 : : "g" (-width), "r" (src+width*2), "r" (dst+width) | |
1720 : "%"REG_a | |
1721 ); | |
1722 #else | |
1723 int i; | |
1724 for(i=0; i<width; i++) | |
1725 dst[i]= src[2*i]; | |
1726 #endif | |
1727 } | |
1728 | |
// Extracts the chroma samples from two lines of packed YUY2 data and averages
// the two lines vertically.
//   dstU, dstV - receive one byte per output chroma sample
//   src1, src2 - the two packed input lines
//   width      - number of output chroma samples (4 input bytes each per line)
// C fallback: U comes from byte 1 and V from byte 3 of every 4-byte group, and
// the two lines are averaged with a truncating >>1.
// The MMX2/3DNow path averages with the PAVGB macro (defined outside this
// block; NOTE(review): its rounding may differ from the truncating C version -
// confirm) and writes 4 U and 4 V bytes per iteration, so it can run past width
// when width is not a multiple of 4.
1729 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) | |
1730 { | |
1731 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1732 asm volatile( | |
1733 "movq "MANGLE(bm01010101)", %%mm4\n\t" | |
1734 "mov %0, %%"REG_a" \n\t" | |
1735 "1: \n\t" | |
// load 16 bytes from each line (%1 = end of src1, %2 = end of src2)
1736 "movq (%1, %%"REG_a",4), %%mm0 \n\t" | |
1737 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" | |
1738 "movq (%2, %%"REG_a",4), %%mm2 \n\t" | |
1739 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t" | |
1740 PAVGB(%%mm2, %%mm0) | |
1741 PAVGB(%%mm3, %%mm1) | |
// keep the high byte of every word (the chroma bytes), then pack to bytes
1742 "psrlw $8, %%mm0 \n\t" | |
1743 "psrlw $8, %%mm1 \n\t" | |
1744 "packuswb %%mm1, %%mm0 \n\t" | |
// split the interleaved chroma bytes: mm0 = high bytes, mm1 = low bytes
1745 "movq %%mm0, %%mm1 \n\t" | |
1746 "psrlw $8, %%mm0 \n\t" | |
1747 "pand %%mm4, %%mm1 \n\t" | |
1748 "packuswb %%mm0, %%mm0 \n\t" | |
1749 "packuswb %%mm1, %%mm1 \n\t" | |
// %4 = end of dstV, %3 = end of dstU
1750 "movd %%mm0, (%4, %%"REG_a") \n\t" | |
1751 "movd %%mm1, (%3, %%"REG_a") \n\t" | |
1752 "add $4, %%"REG_a" \n\t" | |
1753 " js 1b \n\t" | |
1754 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width) | |
1755 : "%"REG_a | |
1756 ); | |
1757 #else | |
1758 int i; | |
1759 for(i=0; i<width; i++) | |
1760 { | |
1761 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1; | |
1762 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1; | |
1763 } | |
1764 #endif | |
1765 } | |
1766 | |
1767 //this is almost identical to the previous, and exists only because yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses | |
// Extracts the luma bytes from one line of packed UYVY data.
//   dst   - receives one byte per output sample
//   src   - packed input; the C fallback copies src[2*i+1] to dst[i]
//   width - number of output samples
// The MMX path writes 8 output bytes per iteration and counts REG_a from -width
// towards 0 in steps of 8; the body always runs at least once, so it can touch
// bytes beyond width when width is not a multiple of 8.
1768 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width) | |
1769 { | |
1770 #ifdef HAVE_MMX | |
1771 asm volatile( | |
1772 "mov %0, %%"REG_a" \n\t" | |
1773 "1: \n\t" | |
1774 "movq (%1, %%"REG_a",2), %%mm0 \n\t" | |
1775 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" | |
// keep the high byte of every 16-bit word (the odd source bytes)
1776 "psrlw $8, %%mm0 \n\t" | |
1777 "psrlw $8, %%mm1 \n\t" | |
1778 "packuswb %%mm1, %%mm0 \n\t" | |
1779 "movq %%mm0, (%2, %%"REG_a") \n\t" | |
1780 "add $8, %%"REG_a" \n\t" | |
1781 " js 1b \n\t" | |
1782 : : "g" (-width), "r" (src+width*2), "r" (dst+width) | |
1783 : "%"REG_a | |
1784 ); | |
1785 #else | |
1786 int i; | |
1787 for(i=0; i<width; i++) | |
1788 dst[i]= src[2*i+1]; | |
1789 #endif | |
1790 } | |
1791 | |
// Extracts the chroma samples from two lines of packed UYVY data and averages
// the two lines vertically.
//   dstU, dstV - receive one byte per output chroma sample
//   src1, src2 - the two packed input lines
//   width      - number of output chroma samples (4 input bytes each per line)
// C fallback: U comes from byte 0 and V from byte 2 of every 4-byte group, and
// the two lines are averaged with a truncating >>1.
// The MMX2/3DNow path averages with the PAVGB macro (defined outside this
// block; NOTE(review): its rounding may differ from the truncating C version -
// confirm) and writes 4 U and 4 V bytes per iteration.
1792 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) | |
1793 { | |
1794 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1795 asm volatile( | |
1796 "movq "MANGLE(bm01010101)", %%mm4\n\t" | |
1797 "mov %0, %%"REG_a" \n\t" | |
1798 "1: \n\t" | |
1799 "movq (%1, %%"REG_a",4), %%mm0 \n\t" | |
1800 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" | |
1801 "movq (%2, %%"REG_a",4), %%mm2 \n\t" | |
1802 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t" | |
1803 PAVGB(%%mm2, %%mm0) | |
1804 PAVGB(%%mm3, %%mm1) | |
// keep the low byte of every word (the chroma bytes), then pack to bytes
1805 "pand %%mm4, %%mm0 \n\t" | |
1806 "pand %%mm4, %%mm1 \n\t" | |
1807 "packuswb %%mm1, %%mm0 \n\t" | |
// split the interleaved chroma bytes: mm0 = high bytes, mm1 = low bytes
1808 "movq %%mm0, %%mm1 \n\t" | |
1809 "psrlw $8, %%mm0 \n\t" | |
1810 "pand %%mm4, %%mm1 \n\t" | |
1811 "packuswb %%mm0, %%mm0 \n\t" | |
1812 "packuswb %%mm1, %%mm1 \n\t" | |
// %4 = end of dstV, %3 = end of dstU
1813 "movd %%mm0, (%4, %%"REG_a") \n\t" | |
1814 "movd %%mm1, (%3, %%"REG_a") \n\t" | |
1815 "add $4, %%"REG_a" \n\t" | |
1816 " js 1b \n\t" | |
1817 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width) | |
1818 : "%"REG_a | |
1819 ); | |
1820 #else | |
1821 int i; | |
1822 for(i=0; i<width; i++) | |
1823 { | |
1824 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1; | |
1825 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1; | |
1826 } | |
1827 #endif | |
1828 } | |
1829 | |
1830 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width) | |
1831 { | |
1832 int i; | |
1833 for(i=0; i<width; i++) | |
1834 { | |
1835 int b= ((uint32_t*)src)[i]&0xFF; | |
1836 int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
1837 int r= (((uint32_t*)src)[i]>>16)&0xFF; | |
1838 | |
1839 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); | |
1840 } | |
1841 } | |
1842 | |
1843 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1844 { | |
1845 int i; | |
1846 for(i=0; i<width; i++) | |
1847 { | |
1848 const int a= ((uint32_t*)src1)[2*i+0]; | |
1849 const int e= ((uint32_t*)src1)[2*i+1]; | |
1850 const int c= ((uint32_t*)src2)[2*i+0]; | |
1851 const int d= ((uint32_t*)src2)[2*i+1]; | |
1852 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF); | |
1853 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00); | |
1854 const int b= l&0x3FF; | |
1855 const int g= h>>8; | |
1856 const int r= l>>16; | |
1857 | |
1858 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1859 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1860 } | |
1861 } | |
1862 | |
// Converts one line of packed 24-bit pixels to 8-bit luma.
//   dst   - receives one luma byte per pixel
//   src   - input, 3 bytes per pixel; the C fallback reads byte 0 as blue,
//           byte 1 as green and byte 2 as red
//   width - number of pixels
// C fallback: Y = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT,
// where the additive term is 16.5 in the fixed-point scale.
// MMX path: REG_a is the negative pixel index (starts at -width), REG_d the
// matching byte offset (3*REG_a). Each iteration converts 8 pixels (24 source
// bytes) and stores 8 luma bytes; the loop body always runs at least once, so it
// can run past width when width is not a multiple of 8.
// bgr2YCoeff, w1111 and bgr2YOffset are constants defined outside this block.
1863 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width) | |
1864 { | |
1865 #ifdef HAVE_MMX | |
1866 asm volatile( | |
1867 "mov %2, %%"REG_a" \n\t" | |
1868 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" | |
1869 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
1870 "pxor %%mm7, %%mm7 \n\t" | |
19396 | 1871 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t" |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
1872 ASMALIGN(4) |
18861 | 1873 "1: \n\t" |
19396 | 1874 PREFETCH" 64(%0, %%"REG_d") \n\t" |
// pixels 0-3: load 4 bytes at each 3-byte pixel offset and widen to words
1875 "movd (%0, %%"REG_d"), %%mm0 \n\t" | |
1876 "movd 3(%0, %%"REG_d"), %%mm1 \n\t" | |
18861 | 1877 "punpcklbw %%mm7, %%mm0 \n\t" |
1878 "punpcklbw %%mm7, %%mm1 \n\t" | |
19396 | 1879 "movd 6(%0, %%"REG_d"), %%mm2 \n\t" |
1880 "movd 9(%0, %%"REG_d"), %%mm3 \n\t" | |
18861 | 1881 "punpcklbw %%mm7, %%mm2 \n\t" |
1882 "punpcklbw %%mm7, %%mm3 \n\t" | |
// multiply by the luma coefficients and add adjacent products
1883 "pmaddwd %%mm6, %%mm0 \n\t" | |
1884 "pmaddwd %%mm6, %%mm1 \n\t" | |
1885 "pmaddwd %%mm6, %%mm2 \n\t" | |
1886 "pmaddwd %%mm6, %%mm3 \n\t" | |
1887 #ifndef FAST_BGR2YV12 | |
1888 "psrad $8, %%mm0 \n\t" | |
1889 "psrad $8, %%mm1 \n\t" | |
1890 "psrad $8, %%mm2 \n\t" | |
1891 "psrad $8, %%mm3 \n\t" | |
1892 #endif | |
// second pmaddwd (with w1111) adds the remaining partial sums per pixel
1893 "packssdw %%mm1, %%mm0 \n\t" | |
1894 "packssdw %%mm3, %%mm2 \n\t" | |
1895 "pmaddwd %%mm5, %%mm0 \n\t" | |
1896 "pmaddwd %%mm5, %%mm2 \n\t" | |
1897 "packssdw %%mm2, %%mm0 \n\t" | |
1898 "psraw $7, %%mm0 \n\t" | |
1899 | |
// pixels 4-7: same sequence, result accumulated in mm4
19396 | 1900 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" |
1901 "movd 15(%0, %%"REG_d"), %%mm1 \n\t" | |
18861 | 1902 "punpcklbw %%mm7, %%mm4 \n\t" |
1903 "punpcklbw %%mm7, %%mm1 \n\t" | |
19396 | 1904 "movd 18(%0, %%"REG_d"), %%mm2 \n\t" |
1905 "movd 21(%0, %%"REG_d"), %%mm3 \n\t" | |
18861 | 1906 "punpcklbw %%mm7, %%mm2 \n\t" |
1907 "punpcklbw %%mm7, %%mm3 \n\t" | |
1908 "pmaddwd %%mm6, %%mm4 \n\t" | |
1909 "pmaddwd %%mm6, %%mm1 \n\t" | |
1910 "pmaddwd %%mm6, %%mm2 \n\t" | |
1911 "pmaddwd %%mm6, %%mm3 \n\t" | |
1912 #ifndef FAST_BGR2YV12 | |
1913 "psrad $8, %%mm4 \n\t" | |
1914 "psrad $8, %%mm1 \n\t" | |
1915 "psrad $8, %%mm2 \n\t" | |
1916 "psrad $8, %%mm3 \n\t" | |
1917 #endif | |
1918 "packssdw %%mm1, %%mm4 \n\t" | |
1919 "packssdw %%mm3, %%mm2 \n\t" | |
1920 "pmaddwd %%mm5, %%mm4 \n\t" | |
1921 "pmaddwd %%mm5, %%mm2 \n\t" | |
19396 | 1922 "add $24, %%"REG_d" \n\t" |
18861 | 1923 "packssdw %%mm2, %%mm4 \n\t" |
1924 "psraw $7, %%mm4 \n\t" | |
1925 | |
// combine the two groups of 4 into 8 bytes and add the luma offset (saturating)
1926 "packuswb %%mm4, %%mm0 \n\t" | |
1927 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" | |
1928 | |
1929 "movq %%mm0, (%1, %%"REG_a") \n\t" | |
1930 "add $8, %%"REG_a" \n\t" | |
1931 " js 1b \n\t" | |
1932 : : "r" (src+width*3), "r" (dst+width), "g" (-width) | |
19396 | 1933 : "%"REG_a, "%"REG_d |
18861 | 1934 ); |
1935 #else | |
1936 int i; | |
1937 for(i=0; i<width; i++) | |
1938 { | |
1939 int b= src[i*3+0]; | |
1940 int g= src[i*3+1]; | |
1941 int r= src[i*3+2]; | |
1942 | |
1943 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); | |
1944 } | |
1945 #endif | |
1946 } | |
1947 | |
// Converts two lines of packed 24-bit pixels to one line of 2x2-subsampled chroma.
//   dstU, dstV - receive one byte per output chroma sample
//   src1, src2 - the two input lines, 6 bytes (two pixels) per output sample;
//                the C fallback reads bytes 0/3 as blue, 1/4 as green, 2/5 as red
//   width      - number of output chroma samples
// C fallback: the four pixels of each 2x2 group are summed per channel and the
// division by four is folded into the final shift (RGB2YUV_SHIFT+2).
// MMX path: REG_a is the negative output index (starts at -width), REG_d the
// matching byte offset (6*REG_a). Each iteration consumes 24 bytes of each line
// and stores 4 U and 4 V bytes; the loop body always runs at least once, so it
// can run past width when width is not a multiple of 4.
// With MMX2/3DNow the 2x2 average is built with the PAVGB macro; otherwise the
// four pixels are summed as words and shifted right by 2.
// bgr2UCoeff, bgr2VCoeff, w1111 and bgr2UVOffset are defined outside this block.
1948 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) | |
1949 { | |
1950 #ifdef HAVE_MMX | |
1951 asm volatile( | |
1952 "mov %4, %%"REG_a" \n\t" | |
1953 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
1954 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | |
1955 "pxor %%mm7, %%mm7 \n\t" | |
19396 | 1956 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" |
1957 "add %%"REG_d", %%"REG_d" \n\t" | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
1958 ASMALIGN(4) |
18861 | 1959 "1: \n\t" |
19396 | 1960 PREFETCH" 64(%0, %%"REG_d") \n\t" |
1961 PREFETCH" 64(%1, %%"REG_d") \n\t" | |
// output samples 0 and 1: average the 2x2 pixel groups into mm0 and mm2
18861 | 1962 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
19396 | 1963 "movq (%0, %%"REG_d"), %%mm0 \n\t" |
1964 "movq (%1, %%"REG_d"), %%mm1 \n\t" | |
1965 "movq 6(%0, %%"REG_d"), %%mm2 \n\t" | |
1966 "movq 6(%1, %%"REG_d"), %%mm3 \n\t" | |
18861 | 1967 PAVGB(%%mm1, %%mm0) |
1968 PAVGB(%%mm3, %%mm2) | |
1969 "movq %%mm0, %%mm1 \n\t" | |
1970 "movq %%mm2, %%mm3 \n\t" | |
1971 "psrlq $24, %%mm0 \n\t" | |
1972 "psrlq $24, %%mm2 \n\t" | |
1973 PAVGB(%%mm1, %%mm0) | |
1974 PAVGB(%%mm3, %%mm2) | |
1975 "punpcklbw %%mm7, %%mm0 \n\t" | |
1976 "punpcklbw %%mm7, %%mm2 \n\t" | |
1977 #else | |
19396 | 1978 "movd (%0, %%"REG_d"), %%mm0 \n\t" |
1979 "movd (%1, %%"REG_d"), %%mm1 \n\t" | |
1980 "movd 3(%0, %%"REG_d"), %%mm2 \n\t" | |
1981 "movd 3(%1, %%"REG_d"), %%mm3 \n\t" | |
18861 | 1982 "punpcklbw %%mm7, %%mm0 \n\t" |
1983 "punpcklbw %%mm7, %%mm1 \n\t" | |
1984 "punpcklbw %%mm7, %%mm2 \n\t" | |
1985 "punpcklbw %%mm7, %%mm3 \n\t" | |
1986 "paddw %%mm1, %%mm0 \n\t" | |
1987 "paddw %%mm3, %%mm2 \n\t" | |
1988 "paddw %%mm2, %%mm0 \n\t" | |
19396 | 1989 "movd 6(%0, %%"REG_d"), %%mm4 \n\t" |
1990 "movd 6(%1, %%"REG_d"), %%mm1 \n\t" | |
1991 "movd 9(%0, %%"REG_d"), %%mm2 \n\t" | |
1992 "movd 9(%1, %%"REG_d"), %%mm3 \n\t" | |
18861 | 1993 "punpcklbw %%mm7, %%mm4 \n\t" |
1994 "punpcklbw %%mm7, %%mm1 \n\t" | |
1995 "punpcklbw %%mm7, %%mm2 \n\t" | |
1996 "punpcklbw %%mm7, %%mm3 \n\t" | |
1997 "paddw %%mm1, %%mm4 \n\t" | |
1998 "paddw %%mm3, %%mm2 \n\t" | |
1999 "paddw %%mm4, %%mm2 \n\t" | |
2000 "psrlw $2, %%mm0 \n\t" | |
2001 "psrlw $2, %%mm2 \n\t" | |
2002 #endif | |
// apply the V coefficients (mm1/mm3) and the U coefficients (mm6) to both samples
2003 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" | |
2004 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
2005 | |
2006 "pmaddwd %%mm0, %%mm1 \n\t" | |
2007 "pmaddwd %%mm2, %%mm3 \n\t" | |
2008 "pmaddwd %%mm6, %%mm0 \n\t" | |
2009 "pmaddwd %%mm6, %%mm2 \n\t" | |
2010 #ifndef FAST_BGR2YV12 | |
2011 "psrad $8, %%mm0 \n\t" | |
2012 "psrad $8, %%mm1 \n\t" | |
2013 "psrad $8, %%mm2 \n\t" | |
2014 "psrad $8, %%mm3 \n\t" | |
2015 #endif | |
2016 "packssdw %%mm2, %%mm0 \n\t" | |
2017 "packssdw %%mm3, %%mm1 \n\t" | |
2018 "pmaddwd %%mm5, %%mm0 \n\t" | |
2019 "pmaddwd %%mm5, %%mm1 \n\t" | |
2020 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
2021 "psraw $7, %%mm0 \n\t" | |
2022 | |
// output samples 2 and 3: same sequence on the next 12 bytes, result in mm4
2023 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
19396 | 2024 "movq 12(%0, %%"REG_d"), %%mm4 \n\t" |
2025 "movq 12(%1, %%"REG_d"), %%mm1 \n\t" | |
2026 "movq 18(%0, %%"REG_d"), %%mm2 \n\t" | |
2027 "movq 18(%1, %%"REG_d"), %%mm3 \n\t" | |
18861 | 2028 PAVGB(%%mm1, %%mm4) |
2029 PAVGB(%%mm3, %%mm2) | |
2030 "movq %%mm4, %%mm1 \n\t" | |
2031 "movq %%mm2, %%mm3 \n\t" | |
2032 "psrlq $24, %%mm4 \n\t" | |
2033 "psrlq $24, %%mm2 \n\t" | |
2034 PAVGB(%%mm1, %%mm4) | |
2035 PAVGB(%%mm3, %%mm2) | |
2036 "punpcklbw %%mm7, %%mm4 \n\t" | |
2037 "punpcklbw %%mm7, %%mm2 \n\t" | |
2038 #else | |
19396 | 2039 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" |
2040 "movd 12(%1, %%"REG_d"), %%mm1 \n\t" | |
2041 "movd 15(%0, %%"REG_d"), %%mm2 \n\t" | |
2042 "movd 15(%1, %%"REG_d"), %%mm3 \n\t" | |
18861 | 2043 "punpcklbw %%mm7, %%mm4 \n\t" |
2044 "punpcklbw %%mm7, %%mm1 \n\t" | |
2045 "punpcklbw %%mm7, %%mm2 \n\t" | |
2046 "punpcklbw %%mm7, %%mm3 \n\t" | |
2047 "paddw %%mm1, %%mm4 \n\t" | |
2048 "paddw %%mm3, %%mm2 \n\t" | |
2049 "paddw %%mm2, %%mm4 \n\t" | |
// mm5 is borrowed as a scratch register here and reloaded with w1111 below
19396 | 2050 "movd 18(%0, %%"REG_d"), %%mm5 \n\t" |
2051 "movd 18(%1, %%"REG_d"), %%mm1 \n\t" | |
2052 "movd 21(%0, %%"REG_d"), %%mm2 \n\t" | |
2053 "movd 21(%1, %%"REG_d"), %%mm3 \n\t" | |
18861 | 2054 "punpcklbw %%mm7, %%mm5 \n\t" |
2055 "punpcklbw %%mm7, %%mm1 \n\t" | |
2056 "punpcklbw %%mm7, %%mm2 \n\t" | |
2057 "punpcklbw %%mm7, %%mm3 \n\t" | |
2058 "paddw %%mm1, %%mm5 \n\t" | |
2059 "paddw %%mm3, %%mm2 \n\t" | |
2060 "paddw %%mm5, %%mm2 \n\t" | |
2061 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
2062 "psrlw $2, %%mm4 \n\t" | |
2063 "psrlw $2, %%mm2 \n\t" | |
2064 #endif | |
2065 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" | |
2066 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
2067 | |
2068 "pmaddwd %%mm4, %%mm1 \n\t" | |
2069 "pmaddwd %%mm2, %%mm3 \n\t" | |
2070 "pmaddwd %%mm6, %%mm4 \n\t" | |
2071 "pmaddwd %%mm6, %%mm2 \n\t" | |
2072 #ifndef FAST_BGR2YV12 | |
2073 "psrad $8, %%mm4 \n\t" | |
2074 "psrad $8, %%mm1 \n\t" | |
2075 "psrad $8, %%mm2 \n\t" | |
2076 "psrad $8, %%mm3 \n\t" | |
2077 #endif | |
2078 "packssdw %%mm2, %%mm4 \n\t" | |
2079 "packssdw %%mm3, %%mm1 \n\t" | |
2080 "pmaddwd %%mm5, %%mm4 \n\t" | |
2081 "pmaddwd %%mm5, %%mm1 \n\t" | |
19396 | 2082 "add $24, %%"REG_d" \n\t" |
18861 | 2083 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 |
2084 "psraw $7, %%mm4 \n\t" | |
2085 | |
// regroup into four U words and four V words, pack to bytes, add the chroma offset
2086 "movq %%mm0, %%mm1 \n\t" | |
2087 "punpckldq %%mm4, %%mm0 \n\t" | |
2088 "punpckhdq %%mm4, %%mm1 \n\t" | |
2089 "packsswb %%mm1, %%mm0 \n\t" | |
2090 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" | |
2091 | |
// low dword -> dstU (%2), high dword -> dstV (%3)
2092 "movd %%mm0, (%2, %%"REG_a") \n\t" | |
2093 "punpckhdq %%mm0, %%mm0 \n\t" | |
2094 "movd %%mm0, (%3, %%"REG_a") \n\t" | |
2095 "add $4, %%"REG_a" \n\t" | |
2096 " js 1b \n\t" | |
2097 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width) | |
19396 | 2098 : "%"REG_a, "%"REG_d |
18861 | 2099 ); |
2100 #else | |
2101 int i; | |
2102 for(i=0; i<width; i++) | |
2103 { | |
2104 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3]; | |
2105 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4]; | |
2106 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5]; | |
2107 | |
2108 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2109 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2110 } | |
2111 #endif | |
2112 } | |
2113 | |
2114 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width) | |
2115 { | |
2116 int i; | |
2117 for(i=0; i<width; i++) | |
2118 { | |
2119 int d= ((uint16_t*)src)[i]; | |
2120 int b= d&0x1F; | |
2121 int g= (d>>5)&0x3F; | |
2122 int r= (d>>11)&0x1F; | |
2123 | |
2124 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16; | |
2125 } | |
2126 } | |
2127 | |
2128 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2129 { | |
2130 int i; | |
2131 for(i=0; i<width; i++) | |
2132 { | |
2133 int d0= ((uint32_t*)src1)[i]; | |
2134 int d1= ((uint32_t*)src2)[i]; | |
2135 | |
2136 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F); | |
2137 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F); | |
2138 | |
2139 int dh2= (dh>>11) + (dh<<21); | |
2140 int d= dh2 + dl; | |
2141 | |
2142 int b= d&0x7F; | |
2143 int r= (d>>11)&0x7F; | |
2144 int g= d>>21; | |
2145 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128; | |
2146 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128; | |
2147 } | |
2148 } | |
2149 | |
2150 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width) | |
2151 { | |
2152 int i; | |
2153 for(i=0; i<width; i++) | |
2154 { | |
2155 int d= ((uint16_t*)src)[i]; | |
2156 int b= d&0x1F; | |
2157 int g= (d>>5)&0x1F; | |
2158 int r= (d>>10)&0x1F; | |
2159 | |
2160 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16; | |
2161 } | |
2162 } | |
2163 | |
2164 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2165 { | |
2166 int i; | |
2167 for(i=0; i<width; i++) | |
2168 { | |
2169 int d0= ((uint32_t*)src1)[i]; | |
2170 int d1= ((uint32_t*)src2)[i]; | |
2171 | |
2172 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F); | |
2173 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F); | |
2174 | |
2175 int dh2= (dh>>11) + (dh<<21); | |
2176 int d= dh2 + dl; | |
2177 | |
2178 int b= d&0x7F; | |
2179 int r= (d>>10)&0x7F; | |
2180 int g= d>>21; | |
2181 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128; | |
2182 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128; | |
2183 } | |
2184 } | |
2185 | |
2186 | |
2187 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width) | |
2188 { | |
2189 int i; | |
2190 for(i=0; i<width; i++) | |
2191 { | |
2192 int r= ((uint32_t*)src)[i]&0xFF; | |
2193 int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
2194 int b= (((uint32_t*)src)[i]>>16)&0xFF; | |
2195 | |
2196 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); | |
2197 } | |
2198 } | |
2199 | |
2200 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2201 { | |
2202 int i; | |
2203 for(i=0; i<width; i++) | |
2204 { | |
2205 const int a= ((uint32_t*)src1)[2*i+0]; | |
2206 const int e= ((uint32_t*)src1)[2*i+1]; | |
2207 const int c= ((uint32_t*)src2)[2*i+0]; | |
2208 const int d= ((uint32_t*)src2)[2*i+1]; | |
2209 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF); | |
2210 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00); | |
2211 const int r= l&0x3FF; | |
2212 const int g= h>>8; | |
2213 const int b= l>>16; | |
2214 | |
2215 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2216 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2217 } | |
2218 } | |
2219 | |
2220 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width) | |
2221 { | |
2222 int i; | |
2223 for(i=0; i<width; i++) | |
2224 { | |
2225 int r= src[i*3+0]; | |
2226 int g= src[i*3+1]; | |
2227 int b= src[i*3+2]; | |
2228 | |
2229 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); | |
2230 } | |
2231 } | |
2232 | |
2233 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2234 { | |
2235 int i; | |
2236 for(i=0; i<width; i++) | |
2237 { | |
2238 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3]; | |
2239 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4]; | |
2240 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5]; | |
2241 | |
2242 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2243 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2244 } | |
2245 } | |
2246 | |
2247 | |
2248 // Bilinear / Bicubic scaling | |
// Horizontal FIR scaler: filters one line of 8-bit samples into 16-bit output.
//   dst        - receives dstW 16-bit results
//   dstW       - number of output samples
//   src        - 8-bit input line
//   srcW, xInc - not used by the code in this function except that they are
//                forwarded to hScale_altivec_real()
//   filter     - filterSize coefficients per output sample, stored consecutively
//   filterPos  - per output sample, index of the first input sample to use
//   filterSize - taps per output sample; the MMX path asserts it is a positive
//                multiple of 4
// Reference behaviour (plain C fallback at the bottom):
//   dst[i] = clip( sum_j src[filterPos[i]+j] * filter[filterSize*i+j] >> 7, 0, 32767 )
// The MMX code has specialised loops for filterSize 4 and 8 and a generic loop
// for everything else. All three produce two output samples per iteration.
2249 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, | |
2250 int16_t *filter, int16_t *filterPos, long filterSize) | |
2251 { | |
2252 #ifdef HAVE_MMX | |
2253 assert(filterSize % 4 == 0 && filterSize>0); | |
2254 if(filterSize==4) // always true for upscaling, sometimes for down too | |
2255 { | |
// counter is the negative byte offset into dst (2 bytes per sample); the three
// pointers are advanced to the end of their arrays so that the asm can index
// them with the negative counter and stop when it wraps ("jnc").
2256 long counter= -2*dstW; | |
2257 filter-= counter*2; | |
2258 filterPos-= counter/2; | |
2259 dst-= counter/2; | |
2260 asm volatile( | |
// with PIC, REG_b cannot be listed as a clobber, so it is saved/restored by hand
19396 | 2261 #if defined(PIC) |
2262 "push %%"REG_b" \n\t" | |
2263 #endif | |
18861 | 2264 "pxor %%mm7, %%mm7 \n\t" |
2265 "movq "MANGLE(w02)", %%mm6 \n\t" | |
2266 "push %%"REG_BP" \n\t" // we use 7 regs here ... | |
2267 "mov %%"REG_a", %%"REG_BP" \n\t" | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
2268 ASMALIGN(4) |
18861 | 2269 "1: \n\t" |
// eax/ebx = filterPos of the two output samples handled in this iteration
2270 "movzwl (%2, %%"REG_BP"), %%eax \n\t" | |
2271 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t" | |
2272 "movq (%1, %%"REG_BP", 4), %%mm1\n\t" | |
2273 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t" | |
2274 "movd (%3, %%"REG_a"), %%mm0 \n\t" | |
2275 "movd (%3, %%"REG_b"), %%mm2 \n\t" | |
2276 "punpcklbw %%mm7, %%mm0 \n\t" | |
2277 "punpcklbw %%mm7, %%mm2 \n\t" | |
2278 "pmaddwd %%mm1, %%mm0 \n\t" | |
2279 "pmaddwd %%mm2, %%mm3 \n\t" | |
2280 "psrad $8, %%mm0 \n\t" | |
2281 "psrad $8, %%mm3 \n\t" | |
2282 "packssdw %%mm3, %%mm0 \n\t" | |
2283 "pmaddwd %%mm6, %%mm0 \n\t" | |
2284 "packssdw %%mm0, %%mm0 \n\t" | |
// store two 16-bit results
2285 "movd %%mm0, (%4, %%"REG_BP") \n\t" | |
2286 "add $4, %%"REG_BP" \n\t" | |
2287 " jnc 1b \n\t" | |
2288 | |
2289 "pop %%"REG_BP" \n\t" | |
19396 | 2290 #if defined(PIC) |
2291 "pop %%"REG_b" \n\t" | |
2292 #endif | |
18861 | 2293 : "+a" (counter) |
2294 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
19396 | 2295 #if !defined(PIC) |
18861 | 2296 : "%"REG_b |
19396 | 2297 #endif |
18861 | 2298 ); |
2299 } | |
2300 else if(filterSize==8) | |
2301 { | |
// same scheme as the 4-tap case, with 8 coefficients per output sample
2302 long counter= -2*dstW; | |
2303 filter-= counter*4; | |
2304 filterPos-= counter/2; | |
2305 dst-= counter/2; | |
2306 asm volatile( | |
19396 | 2307 #if defined(PIC) |
2308 "push %%"REG_b" \n\t" | |
2309 #endif | |
18861 | 2310 "pxor %%mm7, %%mm7 \n\t" |
2311 "movq "MANGLE(w02)", %%mm6 \n\t" | |
2312 "push %%"REG_BP" \n\t" // we use 7 regs here ... | |
2313 "mov %%"REG_a", %%"REG_BP" \n\t" | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
2314 ASMALIGN(4) |
18861 | 2315 "1: \n\t" |
2316 "movzwl (%2, %%"REG_BP"), %%eax \n\t" | |
2317 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t" | |
// taps 0-3 of both output samples
2318 "movq (%1, %%"REG_BP", 8), %%mm1\n\t" | |
2319 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t" | |
2320 "movd (%3, %%"REG_a"), %%mm0 \n\t" | |
2321 "movd (%3, %%"REG_b"), %%mm2 \n\t" | |
2322 "punpcklbw %%mm7, %%mm0 \n\t" | |
2323 "punpcklbw %%mm7, %%mm2 \n\t" | |
2324 "pmaddwd %%mm1, %%mm0 \n\t" | |
2325 "pmaddwd %%mm2, %%mm3 \n\t" | |
2326 | |
// taps 4-7 of both output samples, accumulated onto the first half
2327 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t" | |
2328 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t" | |
2329 "movd 4(%3, %%"REG_a"), %%mm4 \n\t" | |
2330 "movd 4(%3, %%"REG_b"), %%mm2 \n\t" | |
2331 "punpcklbw %%mm7, %%mm4 \n\t" | |
2332 "punpcklbw %%mm7, %%mm2 \n\t" | |
2333 "pmaddwd %%mm1, %%mm4 \n\t" | |
2334 "pmaddwd %%mm2, %%mm5 \n\t" | |
2335 "paddd %%mm4, %%mm0 \n\t" | |
2336 "paddd %%mm5, %%mm3 \n\t" | |
2337 | |
2338 "psrad $8, %%mm0 \n\t" | |
2339 "psrad $8, %%mm3 \n\t" | |
2340 "packssdw %%mm3, %%mm0 \n\t" | |
2341 "pmaddwd %%mm6, %%mm0 \n\t" | |
2342 "packssdw %%mm0, %%mm0 \n\t" | |
2343 "movd %%mm0, (%4, %%"REG_BP") \n\t" | |
2344 "add $4, %%"REG_BP" \n\t" | |
2345 " jnc 1b \n\t" | |
2346 | |
2347 "pop %%"REG_BP" \n\t" | |
19396 | 2348 #if defined(PIC) |
2349 "pop %%"REG_b" \n\t" | |
2350 #endif | |
18861 | 2351 : "+a" (counter) |
2352 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
19396 | 2353 #if !defined(PIC) |
18861 | 2354 : "%"REG_b |
19396 | 2355 #endif |
18861 | 2356 ); |
2357 } | |
2358 else | |
2359 { | |
// generic tap count: the inner loop ("2:") walks REG_c from src up to
// offset (= src+filterSize) 4 taps at a time, while %1 (filter) advances
// through the coefficients of two neighbouring output samples.
2360 uint8_t *offset = src+filterSize; | |
2361 long counter= -2*dstW; | |
2362 // filter-= counter*filterSize/2; | |
2363 filterPos-= counter/2; | |
2364 dst-= counter/2; | |
2365 asm volatile( | |
2366 "pxor %%mm7, %%mm7 \n\t" | |
2367 "movq "MANGLE(w02)", %%mm6 \n\t" | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
2368 ASMALIGN(4) |
18861 | 2369 "1: \n\t" |
2370 "mov %2, %%"REG_c" \n\t" | |
2371 "movzwl (%%"REG_c", %0), %%eax \n\t" | |
19396 | 2372 "movzwl 2(%%"REG_c", %0), %%edx \n\t" |
18861 | 2373 "mov %5, %%"REG_c" \n\t" |
2374 "pxor %%mm4, %%mm4 \n\t" | |
2375 "pxor %%mm5, %%mm5 \n\t" | |
2376 "2: \n\t" | |
2377 "movq (%1), %%mm1 \n\t" | |
2378 "movq (%1, %6), %%mm3 \n\t" | |
2379 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t" | |
19396 | 2380 "movd (%%"REG_c", %%"REG_d"), %%mm2\n\t" |
18861 | 2381 "punpcklbw %%mm7, %%mm0 \n\t" |
2382 "punpcklbw %%mm7, %%mm2 \n\t" | |
2383 "pmaddwd %%mm1, %%mm0 \n\t" | |
2384 "pmaddwd %%mm2, %%mm3 \n\t" | |
2385 "paddd %%mm3, %%mm5 \n\t" | |
2386 "paddd %%mm0, %%mm4 \n\t" | |
2387 "add $8, %1 \n\t" | |
2388 "add $4, %%"REG_c" \n\t" | |
2389 "cmp %4, %%"REG_c" \n\t" | |
2390 " jb 2b \n\t" | |
// skip the coefficients of the second sample, which were read via (%1, %6)
2391 "add %6, %1 \n\t" | |
2392 "psrad $8, %%mm4 \n\t" | |
2393 "psrad $8, %%mm5 \n\t" | |
2394 "packssdw %%mm5, %%mm4 \n\t" | |
2395 "pmaddwd %%mm6, %%mm4 \n\t" | |
2396 "packssdw %%mm4, %%mm4 \n\t" | |
2397 "mov %3, %%"REG_a" \n\t" | |
2398 "movd %%mm4, (%%"REG_a", %0) \n\t" | |
2399 "add $4, %0 \n\t" | |
2400 " jnc 1b \n\t" | |
2401 | |
2402 : "+r" (counter), "+r" (filter) | |
2403 : "m" (filterPos), "m" (dst), "m"(offset), | |
2404 "m" (src), "r" (filterSize*2) | |
19396 | 2405 : "%"REG_a, "%"REG_c, "%"REG_d |
18861 | 2406 ); |
2407 } | |
2408 #else | |
2409 #ifdef HAVE_ALTIVEC | |
2410 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize); | |
2411 #else | |
2412 int i; | |
2413 for(i=0; i<dstW; i++) | |
2414 { | |
2415 int j; | |
2416 int srcPos= filterPos[i]; | |
2417 int val=0; | |
2418 // printf("filterPos: %d\n", filterPos[i]); | |
2419 for(j=0; j<filterSize; j++) | |
2420 { | |
2421 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); | |
2422 val += ((int)src[srcPos + j])*filter[filterSize*i + j]; | |
2423 } | |
2424 // filter += hFilterSize; | |
19181 | 2425 dst[i] = FFMIN(FFMAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ... |
18861 | 2426 // dst[i] = val>>7; |
2427 } | |
2428 #endif | |
2429 #endif | |
2430 } | |
2431 // *** horizontal scale Y line to temp buffer | |
// Horizontally scales one luma line into the 16-bit temp buffer dst.
// Step 1: if srcFormat is one of the packed YUV / RGB formats tested below, the
//         line is first converted to 8-bit luma in formatConvBuffer and src is
//         redirected to that buffer.
// Step 2: the line is scaled either with the generic filter (hScale) or, when
//         SWS_FAST_BILINEAR is set, with one of the fast bilinear paths:
//           - MMX2: calls the code pointed to by funnyYCode eight times
//             (NOTE(review): presumably generated at init time elsewhere - confirm)
//           - plain x86 asm: two output samples per loop iteration
//           - portable C: 16.16 fixed-point position, 7-bit blend factor
//         With HAVE_MMX the fast path is only taken when canMMX2BeUsed is set;
//         otherwise hScale is used even if SWS_FAST_BILINEAR was requested.
// Fast-path results are scaled by 128 (src<<7), as the C fallback shows.
2432 static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc, | |
2433 int flags, int canMMX2BeUsed, int16_t *hLumFilter, | |
2434 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, | |
2435 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, | |
2436 int32_t *mmx2FilterPos) | |
2437 { | |
2438 if(srcFormat==IMGFMT_YUY2) | |
2439 { | |
2440 RENAME(yuy2ToY)(formatConvBuffer, src, srcW); | |
2441 src= formatConvBuffer; | |
2442 } | |
2443 else if(srcFormat==IMGFMT_UYVY) | |
2444 { | |
2445 RENAME(uyvyToY)(formatConvBuffer, src, srcW); | |
2446 src= formatConvBuffer; | |
2447 } | |
2448 else if(srcFormat==IMGFMT_BGR32) | |
2449 { | |
2450 RENAME(bgr32ToY)(formatConvBuffer, src, srcW); | |
2451 src= formatConvBuffer; | |
2452 } | |
2453 else if(srcFormat==IMGFMT_BGR24) | |
2454 { | |
2455 RENAME(bgr24ToY)(formatConvBuffer, src, srcW); | |
2456 src= formatConvBuffer; | |
2457 } | |
2458 else if(srcFormat==IMGFMT_BGR16) | |
2459 { | |
2460 RENAME(bgr16ToY)(formatConvBuffer, src, srcW); | |
2461 src= formatConvBuffer; | |
2462 } | |
2463 else if(srcFormat==IMGFMT_BGR15) | |
2464 { | |
2465 RENAME(bgr15ToY)(formatConvBuffer, src, srcW); | |
2466 src= formatConvBuffer; | |
2467 } | |
2468 else if(srcFormat==IMGFMT_RGB32) | |
2469 { | |
2470 RENAME(rgb32ToY)(formatConvBuffer, src, srcW); | |
2471 src= formatConvBuffer; | |
2472 } | |
2473 else if(srcFormat==IMGFMT_RGB24) | |
2474 { | |
2475 RENAME(rgb24ToY)(formatConvBuffer, src, srcW); | |
2476 src= formatConvBuffer; | |
2477 } | |
2478 | |
2479 #ifdef HAVE_MMX | |
2480 // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one) | |
2481 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) | |
2482 #else | |
2483 if(!(flags&SWS_FAST_BILINEAR)) | |
2484 #endif | |
2485 { | |
2486 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); | |
2487 } | |
2488 else // Fast Bilinear upscale / crap downscale | |
2489 { | |
2490 #if defined(ARCH_X86) || defined(ARCH_X86_64) | |
2491 #ifdef HAVE_MMX2 | |
2492 int i; | |
// with PIC, REG_b cannot be listed as a clobber; it is saved to this slot instead
19396 | 2493 #if defined(PIC) |
2494 uint64_t ebxsave __attribute__((aligned(8))); | |
2495 #endif | |
18861 | 2496 if(canMMX2BeUsed) |
2497 { | |
2498 asm volatile( | |
19396 | 2499 #if defined(PIC) |
2500 "mov %%"REG_b", %5 \n\t" | |
2501 #endif | |
// register setup: REG_c = src, REG_D = dst, REG_d = mmx2Filter, REG_b = mmx2FilterPos
18861 | 2502 "pxor %%mm7, %%mm7 \n\t" |
2503 "mov %0, %%"REG_c" \n\t" | |
2504 "mov %1, %%"REG_D" \n\t" | |
2505 "mov %2, %%"REG_d" \n\t" | |
2506 "mov %3, %%"REG_b" \n\t" | |
2507 "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2508 PREFETCH" (%%"REG_c") \n\t" | |
2509 PREFETCH" 32(%%"REG_c") \n\t" | |
2510 PREFETCH" 64(%%"REG_c") \n\t" | |
2511 | |
2512 #ifdef ARCH_X86_64 | |
2513 | |
2514 #define FUNNY_Y_CODE \ | |
2515 "movl (%%"REG_b"), %%esi \n\t"\ | |
2516 "call *%4 \n\t"\ | |
2517 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\ | |
2518 "add %%"REG_S", %%"REG_c" \n\t"\ | |
2519 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2520 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2521 | |
2522 #else | |
2523 | |
2524 #define FUNNY_Y_CODE \ | |
2525 "movl (%%"REG_b"), %%esi \n\t"\ | |
2526 "call *%4 \n\t"\ | |
2527 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\ | |
2528 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2529 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2530 | |
2531 #endif | |
2532 | |
2533 FUNNY_Y_CODE | |
2534 FUNNY_Y_CODE | |
2535 FUNNY_Y_CODE | |
2536 FUNNY_Y_CODE | |
2537 FUNNY_Y_CODE | |
2538 FUNNY_Y_CODE | |
2539 FUNNY_Y_CODE | |
2540 FUNNY_Y_CODE | |
2541 | |
19396 | 2542 #if defined(PIC) |
2543 "mov %5, %%"REG_b" \n\t" | |
2544 #endif | |
18861 | 2545 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2546 "m" (funnyYCode) | |
19396 | 2547 #if defined(PIC) |
2548 ,"m" (ebxsave) | |
2549 #endif | |
2550 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D | |
2551 #if !defined(PIC) | |
2552 ,"%"REG_b | |
2553 #endif | |
18861 | 2554 ); |
// right edge fix-up: every output whose source position is at or beyond the
// last input sample is overwritten with that last sample (scaled by 128)
2555 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; | |
2556 } | |
2557 else | |
2558 { | |
2559 #endif | |
// split the 16.16 fixed-point step into integer and fractional parts
2560 long xInc_shr16 = xInc >> 16; | |
2561 uint16_t xInc_mask = xInc & 0xffff; | |
2562 //NO MMX just normal asm ... | |
2563 asm volatile( | |
2564 "xor %%"REG_a", %%"REG_a" \n\t" // i | |
19396 | 2565 "xor %%"REG_d", %%"REG_d" \n\t" // xx |
18861 | 2566 "xorl %%ecx, %%ecx \n\t" // 2*xalpha |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
2567 ASMALIGN(4) |
18861 | 2568 "1: \n\t" |
19396 | 2569 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] |
2570 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
18861 | 2571 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2572 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2573 "shll $16, %%edi \n\t" | |
2574 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2575 "mov %1, %%"REG_D" \n\t" | |
2576 "shrl $9, %%esi \n\t" | |
2577 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t" | |
2578 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF | |
19396 | 2579 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry |
18861 | 2580 |
19396 | 2581 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] |
2582 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
18861 | 2583 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2584 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2585 "shll $16, %%edi \n\t" | |
2586 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2587 "mov %1, %%"REG_D" \n\t" | |
2588 "shrl $9, %%esi \n\t" | |
2589 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t" | |
2590 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF | |
19396 | 2591 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry |
18861 | 2592 |
2593 | |
2594 "add $2, %%"REG_a" \n\t" | |
2595 "cmp %2, %%"REG_a" \n\t" | |
2596 " jb 1b \n\t" | |
2597 | |
2598 | |
2599 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask) | |
19396 | 2600 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" |
18861 | 2601 ); |
2602 #ifdef HAVE_MMX2 | |
2603 } //if MMX2 can't be used | |
2604 #endif | |
2605 #else | |
// portable version: xpos is the 16.16 source position, xalpha the top 7 bits
// of its fraction.
// NOTE(review): src[xx+1] is read unconditionally and may index one sample past
// the end of the line for the last outputs - confirm the caller pads the buffer.
2606 int i; | |
2607 unsigned int xpos=0; | |
2608 for(i=0;i<dstWidth;i++) | |
2609 { | |
2610 register unsigned int xx=xpos>>16; | |
2611 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2612 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; | |
2613 xpos+=xInc; | |
2614 } | |
2615 #endif | |
2616 } | |
2617 } | |
2618 | |
/*
 * Horizontally scale one line of each of the two chroma planes.
 *
 * src1 is scaled into dst[0 .. dstWidth-1] and src2 into
 * dst[2048 .. 2048+dstWidth-1] (dst is uint16_t, so the second plane starts
 * 4096 bytes after the first one).
 *
 * For packed YUV (YUY2/UYVY) and for RGB/BGR source formats the input line is
 * first converted by the matching *ToUV helper into formatConvBuffer (first
 * plane at byte offset 0, second plane at byte offset 2048); src1/src2 are then
 * redirected to that buffer. For gray source formats the function returns
 * without writing dst.
 *
 * Scaler selection:
 *  - SWS_FAST_BILINEAR not set (or, when built with HAVE_MMX, canMMX2BeUsed
 *    is 0): the filter based hScale() is run once per plane using
 *    hChrFilter / hChrFilterPos / hChrFilterSize.
 *  - otherwise, on x86 with HAVE_MMX2 and canMMX2BeUsed: the code pointed to by
 *    funnyUVCode is called from inline asm, driven by mmx2Filter/mmx2FilterPos.
 *  - otherwise, on x86: a plain asm bilinear loop.
 *  - otherwise: a portable C bilinear loop.
 *
 * xInc is the source step per output sample; the bilinear paths treat it as
 * 16.16 fixed point (integer position = pos>>16, fraction = pos&0xFFFF).
 */
2619 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2, | |
2620 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, | |
2621 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, | |
2622 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, | |
2623 int32_t *mmx2FilterPos) | |
2624 { | |
// Input format conversion: every branch writes both chroma planes into
// formatConvBuffer and makes src1/src2 point at the converted data.
2625 if(srcFormat==IMGFMT_YUY2) | |
2626 { | |
2627 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2628 src1= formatConvBuffer; | |
2629 src2= formatConvBuffer+2048; | |
2630 } | |
2631 else if(srcFormat==IMGFMT_UYVY) | |
2632 { | |
2633 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2634 src1= formatConvBuffer; | |
2635 src2= formatConvBuffer+2048; | |
2636 } | |
2637 else if(srcFormat==IMGFMT_BGR32) | |
2638 { | |
2639 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2640 src1= formatConvBuffer; | |
2641 src2= formatConvBuffer+2048; | |
2642 } | |
2643 else if(srcFormat==IMGFMT_BGR24) | |
2644 { | |
2645 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2646 src1= formatConvBuffer; | |
2647 src2= formatConvBuffer+2048; | |
2648 } | |
2649 else if(srcFormat==IMGFMT_BGR16) | |
2650 { | |
2651 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2652 src1= formatConvBuffer; | |
2653 src2= formatConvBuffer+2048; | |
2654 } | |
2655 else if(srcFormat==IMGFMT_BGR15) | |
2656 { | |
2657 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2658 src1= formatConvBuffer; | |
2659 src2= formatConvBuffer+2048; | |
2660 } | |
2661 else if(srcFormat==IMGFMT_RGB32) | |
2662 { | |
2663 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2664 src1= formatConvBuffer; | |
2665 src2= formatConvBuffer+2048; | |
2666 } | |
2667 else if(srcFormat==IMGFMT_RGB24) | |
2668 { | |
2669 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2670 src1= formatConvBuffer; | |
2671 src2= formatConvBuffer+2048; | |
2672 } | |
2673 else if(isGray(srcFormat)) | |
2674 { | |
// gray input: nothing is scaled and dst is left untouched
2675 return; | |
2676 } | |
2677 | |
2678 #ifdef HAVE_MMX | |
2679 // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one) |
2680 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) | |
2681 #else | |
2682 if(!(flags&SWS_FAST_BILINEAR)) | |
2683 #endif | |
2684 { | |
// generic filter based scaler, one call per chroma plane
2685 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2686 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2687 } | |
2688 else // Fast Bilinear upscale / crap downscale |
2689 { | |
2690 #if defined(ARCH_X86) || defined(ARCH_X86_64) | |
2691 #ifdef HAVE_MMX2 | |
2692 int i; | |
// In PIC builds REG_b is not listed as clobbered below; it is saved to and
// restored from this stack slot by the asm itself (operand %6).
19396 | 2693 #if defined(PIC) |
2694 uint64_t ebxsave __attribute__((aligned(8))); | |
2695 #endif | |
18861 | 2696 if(canMMX2BeUsed) |
2697 { | |
// MMX2 path. Register setup visible below: REG_c = source plane, REG_D = dst,
// REG_d = mmx2Filter, REG_b = mmx2FilterPos, REG_a = index (starts at 0).
// FUNNY_UV_CODE is expanded four times for src1 and four times for src2;
// each expansion calls the code at funnyUVCode (operand %4) and then advances
// REG_c and REG_D. The second plane is written at dst + 4096 bytes.
2698 asm volatile( | |
19396 | 2699 #if defined(PIC) |
2700 "mov %%"REG_b", %6 \n\t" | |
2701 #endif | |
18861 | 2702 "pxor %%mm7, %%mm7 \n\t" |
2703 "mov %0, %%"REG_c" \n\t" | |
2704 "mov %1, %%"REG_D" \n\t" | |
2705 "mov %2, %%"REG_d" \n\t" | |
2706 "mov %3, %%"REG_b" \n\t" | |
2707 "xor %%"REG_a", %%"REG_a" \n\t" // i |
2708 PREFETCH" (%%"REG_c") \n\t" | |
2709 PREFETCH" 32(%%"REG_c") \n\t" | |
2710 PREFETCH" 64(%%"REG_c") \n\t" | |
2711 | |
2712 #ifdef ARCH_X86_64 | |
2713 | |
2714 #define FUNNY_UV_CODE \ | |
2715 "movl (%%"REG_b"), %%esi \n\t"\ | |
2716 "call *%4 \n\t"\ | |
2717 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\ | |
2718 "add %%"REG_S", %%"REG_c" \n\t"\ | |
2719 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2720 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2721 | |
2722 #else | |
2723 | |
2724 #define FUNNY_UV_CODE \ | |
2725 "movl (%%"REG_b"), %%esi \n\t"\ | |
2726 "call *%4 \n\t"\ | |
2727 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\ | |
2728 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2729 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2730 | |
2731 #endif | |
2732 | |
2733 FUNNY_UV_CODE | |
2734 FUNNY_UV_CODE | |
2735 FUNNY_UV_CODE | |
2736 FUNNY_UV_CODE | |
2737 "xor %%"REG_a", %%"REG_a" \n\t" // i |
2738 "mov %5, %%"REG_c" \n\t" // src |
2739 "mov %1, %%"REG_D" \n\t" // buf1 |
2740 "add $4096, %%"REG_D" \n\t" | |
2741 PREFETCH" (%%"REG_c") \n\t" | |
2742 PREFETCH" 32(%%"REG_c") \n\t" | |
2743 PREFETCH" 64(%%"REG_c") \n\t" | |
2744 | |
2745 FUNNY_UV_CODE | |
2746 FUNNY_UV_CODE | |
2747 FUNNY_UV_CODE | |
2748 FUNNY_UV_CODE | |
2749 | |
19396 | 2750 #if defined(PIC) |
2751 "mov %6, %%"REG_b" \n\t" | |
2752 #endif | |
18861 | 2753 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2754 "m" (funnyUVCode), "m" (src2) | |
19396 | 2755 #if defined(PIC) |
2756 ,"m" (ebxsave) | |
2757 #endif | |
19400
0310c3310360
Fix compilation with -no-PIC and without -fomit-frame-pointer (used by
uau
parents:
19396
diff
changeset
|
2758 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D |
19396 | 2759 #if !defined(PIC) |
2760 ,"%"REG_b | |
2761 #endif | |
18861 | 2762 ); |
// Right edge fix-up: walking back from the last output, every sample whose
// source position (i*xInc)>>16 is at or beyond srcW-1 is overwritten with the
// last source sample scaled by 128.
2763 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) | |
2764 { | |
2765 // printf("%d %d %d\n", dstWidth, i, srcW); | |
2766 dst[i] = src1[srcW-1]*128; | |
2767 dst[i+2048] = src2[srcW-1]*128; | |
2768 } | |
2769 } | |
2770 else | |
2771 { | |
2772 #endif | |
// Plain x86 bilinear loop. xInc is split into its integer part (xInc_shr16)
// and 16 bit fraction (xInc_mask). Per output sample and per plane it stores
// (src[xx]*65536 + (src[xx+1]-src[xx])*frac) >> 9 as a 16 bit value; the
// fraction is accumulated in cx and its carry propagates into xx via adc.
2773 long xInc_shr16 = (long) (xInc >> 16); | |
2774 uint16_t xInc_mask = xInc & 0xffff; | |
2775 asm volatile( | |
2776 "xor %%"REG_a", %%"REG_a" \n\t" // i |
19396 | 2777 "xor %%"REG_d", %%"REG_d" \n\t" // xx |
18861 | 2778 "xorl %%ecx, %%ecx \n\t" // 2*xalpha |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
2779 ASMALIGN(4) |
18861 | 2780 "1: \n\t" |
2781 "mov %0, %%"REG_S" \n\t" | |
19396 | 2782 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx] |
2783 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1] |
18861 | 2784 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2785 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha |
2786 "shll $16, %%edi \n\t" | |
2787 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) |
2788 "mov %1, %%"REG_D" \n\t" | |
2789 "shrl $9, %%esi \n\t" | |
2790 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t" | |
2791 | |
19396 | 2792 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx] |
2793 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1] |
18861 | 2794 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2795 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha |
2796 "shll $16, %%edi \n\t" | |
2797 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) |
2798 "mov %1, %%"REG_D" \n\t" | |
2799 "shrl $9, %%esi \n\t" | |
2800 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t" | |
2801 | |
2802 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF |
19396 | 2803 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry |
18861 | 2804 "add $1, %%"REG_a" \n\t" |
2805 "cmp %2, %%"REG_a" \n\t" | |
2806 " jb 1b \n\t" | |
2807 | |
2808 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here, | |
2809 which is needed to support GCC-4.0 */ | |
2810 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4)) | |
2811 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), | |
2812 #else | |
2813 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), | |
2814 #endif | |
2815 "r" (src2) | |
19396 | 2816 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" |
18861 | 2817 ); |
2818 #ifdef HAVE_MMX2 | |
2819 } //if MMX2 can't be used |
2820 #endif | |
2821 #else | |
// Portable fallback. xpos is a 16.16 fixed point source position and xalpha
// its top 7 fraction bits (0..127). Note that the two weights used below,
// (xalpha^127) == 127-xalpha and xalpha, sum to 127 rather than 128.
2822 int i; | |
2823 unsigned int xpos=0; | |
2824 for(i=0;i<dstWidth;i++) | |
2825 { | |
2826 register unsigned int xx=xpos>>16; | |
2827 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2828 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | |
2829 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | |
2830 /* slower | |
2831 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; | |
2832 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; | |
2833 */ | |
2834 xpos+=xInc; | |
2835 } | |
2836 #endif | |
2837 } | |
2838 } | |
2839 | |
/*
 * Scale one horizontal slice of the source picture.
 *
 * The slice covers source lines srcSliceY .. srcSliceY+srcSliceH-1. For every
 * destination line, starting at the dstY stored in the context, the function
 *  1. horizontally scales the source lines that line needs (hyscale/hcscale)
 *     into the lumPixBuf / chrPixBuf ring buffers, and
 *  2. runs the vertical scaler / output writer matching dstFormat.
 * When the slice does not contain all lines needed for the next destination
 * line, the remaining lines of the slice are buffered and the loop stops; the
 * progress (dstY, buffer indices, last buffered source lines) is written back
 * to the context so the next call can continue. That state is reset when
 * srcSliceY is 0.
 *
 * NOTE(review): src[] and srcStride[] are modified in place (plane 1 and 2 are
 * aliased to plane 0 for packed source formats, and the chroma strides are
 * shifted left by c->vChrDrop), so the caller's arrays change across the call
 * -- confirm callers pass copies if they reuse them.
 *
 * Returns the number of destination lines produced by this call
 * (dstY - lastDstY).
 */
2840 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | |
2841 int srcSliceH, uint8_t* dst[], int dstStride[]){ | |
2842 | |
2843 /* load a few things into local vars to make the code more readable? and faster */ | |
2844 const int srcW= c->srcW; | |
2845 const int dstW= c->dstW; | |
2846 const int dstH= c->dstH; | |
2847 const int chrDstW= c->chrDstW; | |
2848 const int chrSrcW= c->chrSrcW; | |
2849 const int lumXInc= c->lumXInc; | |
2850 const int chrXInc= c->chrXInc; | |
2851 const int dstFormat= c->dstFormat; | |
2852 const int srcFormat= c->srcFormat; | |
2853 const int flags= c->flags; | |
2854 const int canMMX2BeUsed= c->canMMX2BeUsed; | |
2855 int16_t *vLumFilterPos= c->vLumFilterPos; | |
2856 int16_t *vChrFilterPos= c->vChrFilterPos; | |
2857 int16_t *hLumFilterPos= c->hLumFilterPos; | |
2858 int16_t *hChrFilterPos= c->hChrFilterPos; | |
2859 int16_t *vLumFilter= c->vLumFilter; | |
2860 int16_t *vChrFilter= c->vChrFilter; | |
2861 int16_t *hLumFilter= c->hLumFilter; | |
2862 int16_t *hChrFilter= c->hChrFilter; | |
2863 int32_t *lumMmxFilter= c->lumMmxFilter; | |
2864 int32_t *chrMmxFilter= c->chrMmxFilter; | |
2865 const int vLumFilterSize= c->vLumFilterSize; | |
2866 const int vChrFilterSize= c->vChrFilterSize; | |
2867 const int hLumFilterSize= c->hLumFilterSize; | |
2868 const int hChrFilterSize= c->hChrFilterSize; | |
2869 int16_t **lumPixBuf= c->lumPixBuf; | |
2870 int16_t **chrPixBuf= c->chrPixBuf; | |
2871 const int vLumBufSize= c->vLumBufSize; | |
2872 const int vChrBufSize= c->vChrBufSize; | |
2873 uint8_t *funnyYCode= c->funnyYCode; | |
2874 uint8_t *funnyUVCode= c->funnyUVCode; | |
2875 uint8_t *formatConvBuffer= c->formatConvBuffer; | |
2876 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample; | |
// -((-x)>>s) rounds the subsampled height up instead of down
// (this relies on >> of a negative int being an arithmetic shift)
2877 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); | |
2878 int lastDstY; | |
2879 | |
2880 /* vars which will change and which we need to store back in the context */ | |
2881 int dstY= c->dstY; | |
2882 int lumBufIndex= c->lumBufIndex; | |
2883 int chrBufIndex= c->chrBufIndex; | |
2884 int lastInLumBuf= c->lastInLumBuf; | |
2885 int lastInChrBuf= c->lastInChrBuf; | |
2886 | |
// packed source formats: all three plane pointers/strides refer to plane 0
2887 if(isPacked(c->srcFormat)){ | |
2888 src[0]= | |
2889 src[1]= | |
2890 src[2]= src[0]; | |
2891 srcStride[0]= | |
2892 srcStride[1]= | |
2893 srcStride[2]= srcStride[0]; | |
2894 } | |
// chroma strides are multiplied by 2^vChrDrop (writes to the caller's array)
2895 srcStride[1]<<= c->vChrDrop; | |
2896 srcStride[2]<<= c->vChrDrop; | |
2897 | |
2898 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2], | |
2899 // (int)dst[0], (int)dst[1], (int)dst[2]); | |
2900 | |
2901 #if 0 //self test FIXME move to a vfilter or something | |
2902 { | |
2903 static volatile int i=0; | |
2904 i++; | |
2905 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH) | |
2906 selfTest(src, srcStride, c->srcW, c->srcH); | |
2907 i--; | |
2908 } | |
2909 #endif | |
2910 | |
2911 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2], | |
2912 //dstStride[0],dstStride[1],dstStride[2]); | |
2913 | |
// warn (once per process, and only with SWS_PRINT_INFO) when any destination
// stride is not a multiple of 8; scaling continues regardless
2914 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0) | |
2915 { | |
2916 static int firstTime=1; //FIXME move this into the context perhaps | |
2917 if(flags & SWS_PRINT_INFO && firstTime) | |
2918 { | |
2919 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n" | |
2920 "SwScaler: ->cannot do aligned memory acesses anymore\n"); | |
2921 firstTime=0; | |
2922 } | |
2923 } | |
2924 | |
2925 /* Note the user might start scaling the picture in the middle so this will not get executed | |
2926 this is not really intended but works currently, so ppl might do it */ | |
2927 if(srcSliceY ==0){ | |
2928 lumBufIndex=0; | |
2929 chrBufIndex=0; | |
2930 dstY=0; | |
2931 lastInLumBuf= -1; | |
2932 lastInChrBuf= -1; | |
2933 } | |
2934 | |
// remember where this call started so the number of output lines can be returned
2935 lastDstY= dstY; | |
2936 | |
2937 for(;dstY < dstH; dstY++){ | |
2938 unsigned char *dest =dst[0]+dstStride[0]*dstY; | |
2939 const int chrDstY= dstY>>c->chrDstVSubSample; | |
2940 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY; | |
2941 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY; | |
2942 | |
2943 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input |
2944 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input |
2945 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input |
2946 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input |
2947 | |
2948 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n", | |
2949 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample); | |
2950 //handle holes (FAST_BILINEAR & weird filters) | |
2951 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; | |
2952 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; | |
2953 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize); | |
2954 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1) | |
2955 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1) | |
2956 | |
2957 // Do we have enough lines in this slice to output the dstY line | |
2958 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample)) | |
2959 { | |
2960 //Do horizontal scaling | |
2961 while(lastInLumBuf < lastLumSrcY) | |
2962 { | |
2963 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; | |
2964 lumBufIndex++; | |
2965 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY); | |
2966 ASSERT(lumBufIndex < 2*vLumBufSize) | |
2967 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
2968 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
2969 // printf("%d %d\n", lumBufIndex, vLumBufSize); | |
2970 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, | |
2971 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, | |
2972 funnyYCode, c->srcFormat, formatConvBuffer, | |
2973 c->lumMmx2Filter, c->lumMmx2FilterPos); | |
2974 lastInLumBuf++; | |
2975 } | |
2976 while(lastInChrBuf < lastChrSrcY) | |
2977 { | |
2978 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; | |
2979 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; | |
2980 chrBufIndex++; | |
2981 ASSERT(chrBufIndex < 2*vChrBufSize) | |
2982 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)) | |
2983 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0) | |
2984 //FIXME replace parameters through context struct (some at least) | |
2985 | |
// chroma is not scaled when either side is gray; the buffer index and
// lastInChrBuf still advance
2986 if(!(isGray(srcFormat) || isGray(dstFormat))) | |
2987 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, | |
2988 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, | |
2989 funnyUVCode, c->srcFormat, formatConvBuffer, | |
2990 c->chrMmx2Filter, c->chrMmx2FilterPos); | |
2991 lastInChrBuf++; | |
2992 } | |
2993 //wrap buf index around to stay inside the ring buffer | |
2994 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
2995 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
2996 } | |
2997 else // not enough lines left in this slice -> load the rest in the buffer |
2998 { | |
2999 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n", | |
3000 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY, | |
3001 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize, | |
3002 vChrBufSize, vLumBufSize);*/ | |
3003 | |
3004 //Do horizontal scaling | |
3005 while(lastInLumBuf+1 < srcSliceY + srcSliceH) | |
3006 { | |
3007 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; | |
3008 lumBufIndex++; | |
3009 ASSERT(lumBufIndex < 2*vLumBufSize) | |
3010 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
3011 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
3012 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, | |
3013 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, | |
3014 funnyYCode, c->srcFormat, formatConvBuffer, | |
3015 c->lumMmx2Filter, c->lumMmx2FilterPos); | |
3016 lastInLumBuf++; | |
3017 } | |
3018 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH)) | |
3019 { | |
3020 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; | |
3021 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; | |
3022 chrBufIndex++; | |
3023 ASSERT(chrBufIndex < 2*vChrBufSize) | |
3024 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH) | |
3025 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0) | |
3026 | |
3027 if(!(isGray(srcFormat) || isGray(dstFormat))) | |
3028 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, | |
3029 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, | |
3030 funnyUVCode, c->srcFormat, formatConvBuffer, | |
3031 c->chrMmx2Filter, c->chrMmx2FilterPos); | |
3032 lastInChrBuf++; | |
3033 } | |
3034 //wrap buf index around to stay inside the ring buffer | |
3035 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
3036 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
3037 break; //we can't output a dstY line so let's try with the next slice |
3038 } | |
3039 | |
// select the dither patterns by destination line parity
// (b5Dither etc. are declared outside this function)
3040 #ifdef HAVE_MMX | |
3041 b5Dither= dither8[dstY&1]; | |
3042 g6Dither= dither4[dstY&1]; | |
3043 g5Dither= dither8[dstY&1]; | |
3044 r5Dither= dither8[(dstY+1)&1]; | |
3045 #endif | |
// All destination lines except the last two use the RENAME()d output functions;
// the last two lines always take the plain C functions in the else branch below.
3046 if(dstY < dstH-2) | |
3047 { | |
// pointers to the first buffered luma/chroma line needed for this output line
3048 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
3049 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
3050 #ifdef HAVE_MMX | |
3051 int i; | |
// Fill lumMmxFilter / chrMmxFilter with line pointers and vertical coefficients
// (presumably consumed by the MMX vertical scalers, which are not visible here).
// SWS_ACCURATE_RND layout, per pair of taps: [2*i+0],[2*i+1] = the two line
// pointers, [2*i+2] == [2*i+3] = both 16 bit coefficients packed in one word.
// Otherwise, per tap: [4*i+0] = line pointer, [4*i+2] == [4*i+3] = the
// coefficient replicated into both 16 bit halves.
// NOTE(review): the line pointers are stored in int32_t slots; with 64 bit
// pointers this would truncate -- confirm how x86_64 builds handle this.
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3052 if(flags & SWS_ACCURATE_RND){ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3053 for(i=0; i<vLumFilterSize; i+=2){ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3054 lumMmxFilter[2*i+0]= lumSrcPtr[i ]; |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3055 lumMmxFilter[2*i+1]= lumSrcPtr[i+(vLumFilterSize>1)]; |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3056 lumMmxFilter[2*i+2]= |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3057 lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ] |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3058 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0); |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3059 } |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3060 for(i=0; i<vChrFilterSize; i+=2){ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3061 chrMmxFilter[2*i+0]= chrSrcPtr[i ]; |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3062 chrMmxFilter[2*i+1]= chrSrcPtr[i+(vChrFilterSize>1)]; |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3063 chrMmxFilter[2*i+2]= |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3064 chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ] |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3065 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0); |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3066 } |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3067 }else{ |
18861 | 3068 for(i=0; i<vLumFilterSize; i++) |
3069 { | |
3070 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i]; | |
3071 lumMmxFilter[4*i+2]= | |
3072 lumMmxFilter[4*i+3]= | |
3073 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; | |
3074 } | |
3075 for(i=0; i<vChrFilterSize; i++) | |
3076 { | |
3077 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i]; | |
3078 chrMmxFilter[4*i+2]= | |
3079 chrMmxFilter[4*i+3]= | |
3080 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; | |
3081 } | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3082 } |
18861 | 3083 #endif |
// Output stage, selected by destination format: NV12/NV21, planar YUV or gray,
// everything else is handled by the yuv2packed* functions.
3084 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){ | |
3085 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
3086 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi |
3087 RENAME(yuv2nv12X)(c, | |
3088 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
3089 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3090 dest, uDest, dstW, chrDstW, dstFormat); | |
3091 } | |
3092 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like |
3093 { | |
// chroma pointers are NULLed on lines where (dstY & chrSkipMask) is nonzero
// and for gray output
3094 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
3095 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi |
3096 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12 |
3097 { | |
3098 int16_t *lumBuf = lumPixBuf[0]; | |
3099 int16_t *chrBuf= chrPixBuf[0]; | |
3100 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW); | |
3101 } | |
3102 else //General YV12 |
3103 { | |
3104 RENAME(yuv2yuvX)(c, | |
3105 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
3106 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3107 dest, uDest, vDest, dstW, chrDstW); | |
3108 } | |
3109 } | |
3110 else | |
3111 { | |
3112 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
3113 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
3114 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB |
3115 { | |
3116 int chrAlpha= vChrFilter[2*dstY+1]; | |
3117 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), | |
3118 dest, dstW, chrAlpha, dstFormat, flags, dstY); | |
3119 } | |
3120 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB |
3121 { | |
3122 int lumAlpha= vLumFilter[2*dstY+1]; | |
3123 int chrAlpha= vChrFilter[2*dstY+1]; | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3124 lumMmxFilter[2]= |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3125 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001; |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3126 chrMmxFilter[2]= |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3127 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001; |
18861 | 3128 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), |
3129 dest, dstW, lumAlpha, chrAlpha, dstY); | |
3130 } | |
3131 else //General RGB |
3132 { | |
3133 RENAME(yuv2packedX)(c, | |
3134 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
3135 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3136 dest, dstW, dstY); | |
3137 } | |
3138 } | |
3139 } | |
3140 else // hmm looks like we can't use MMX here without overwriting this array's tail |
3141 { | |
// same dispatch as above, but using the plain C output functions (*inC)
3142 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
3143 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
3144 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){ | |
3145 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
3146 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi |
3147 yuv2nv12XinC( | |
3148 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
3149 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3150 dest, uDest, dstW, chrDstW, dstFormat); | |
3151 } | |
3152 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 |
3153 { | |
3154 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
3155 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi |
3156 yuv2yuvXinC( | |
3157 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
3158 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3159 dest, uDest, vDest, dstW, chrDstW); | |
3160 } | |
3161 else | |
3162 { | |
3163 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
3164 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
3165 yuv2packedXinC(c, | |
3166 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
3167 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3168 dest, dstW, dstY); | |
3169 } | |
3170 } | |
3171 } | |
3172 | |
// issue the SFENCE and EMMS instruction strings once after the line loop
3173 #ifdef HAVE_MMX | |
3174 __asm __volatile(SFENCE:::"memory"); | |
3175 __asm __volatile(EMMS:::"memory"); | |
3176 #endif | |
3177 /* store changed local vars back in the context */ | |
3178 c->dstY= dstY; | |
3179 c->lumBufIndex= lumBufIndex; | |
3180 c->chrBufIndex= chrBufIndex; | |
3181 c->lastInLumBuf= lastInLumBuf; | |
3182 c->lastInChrBuf= lastInChrBuf; | |
3183 | |
3184 return dstY - lastDstY; | |
3185 } |