Mercurial > mplayer.hg
annotate libswscale/swscale_template.c @ 22598:f39115ea61bb
Add AmigaOS support, patch by Andrea Palmat, andrea amigasoft net.
author | diego |
---|---|
date | Thu, 15 Mar 2007 17:06:28 +0000 |
parents | 508e55817748 |
children | 29827d88d2da |
rev | line source |
---|---|
18861 | 1 /* |
20094
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
3 * |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
4 * This file is part of FFmpeg. |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
5 * |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
6 * FFmpeg is free software; you can redistribute it and/or modify |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
7 * it under the terms of the GNU General Public License as published by |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
8 * the Free Software Foundation; either version 2 of the License, or |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
9 * (at your option) any later version. |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
10 * |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
11 * FFmpeg is distributed in the hope that it will be useful, |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
14 * GNU General Public License for more details. |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
15 * |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
16 * You should have received a copy of the GNU General Public License |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
17 * along with FFmpeg; if not, write to the Free Software |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
19 * |
21029
1f2ba24b4e47
Clarify that some of the non-SIMD code is now LGPLed.
lucabe
parents:
20946
diff
changeset
|
20 * the C code (not assembly, mmx, ...) of this file can be used |
1f2ba24b4e47
Clarify that some of the non-SIMD code is now LGPLed.
lucabe
parents:
20946
diff
changeset
|
21 * under the LGPL license too |
20094
aca9e9783f67
Change license headers to say 'FFmpeg' instead of 'this program'.
diego
parents:
20015
diff
changeset
|
22 */ |
18861 | 23 |
/* Drop any earlier definitions of the asm helper macros so that they can be
 * defined afresh below according to HAVE_3DNOW / HAVE_MMX2.
 * NOTE(review): presumably needed because this template is included more
 * than once with different feature flags -- confirm at the include site. */
24 #undef REAL_MOVNTQ | |
25 #undef MOVNTQ | |
26 #undef PAVGB | |
27 #undef PREFETCH | |
28 #undef PREFETCHW | |
29 #undef EMMS | |
30 #undef SFENCE | |
31 | |
/* EMMS: mnemonic string, "femms" when HAVE_3DNOW is defined, "emms" otherwise. */
32 #ifdef HAVE_3DNOW | |
33 /* On K6, femms is faster than emms. On K7, femms is mapped directly onto emms. */ | |
34 #define EMMS "femms" | |
35 #else | |
36 #define EMMS "emms" | |
37 #endif | |
38 | |
/* PREFETCH / PREFETCHW: prefetch mnemonic strings.
 *   HAVE_3DNOW : "prefetch"    / "prefetchw"
 *   HAVE_MMX2  : "prefetchnta" / "prefetcht0"
 *   otherwise  : " # nop" for both.
 * NOTE(review): " # nop" is presumably parsed by the assembler as a comment,
 * so that no instruction is emitted -- confirm for every supported assembler. */
39 #ifdef HAVE_3DNOW | |
40 #define PREFETCH "prefetch" | |
41 #define PREFETCHW "prefetchw" | |
42 #elif defined ( HAVE_MMX2 ) | |
43 #define PREFETCH "prefetchnta" | |
44 #define PREFETCHW "prefetcht0" | |
45 #else | |
20724
b8fe18a742ce
Fix MacIntel build: "/nop" is illegal on Apple's older version of GAS
gpoirier
parents:
20589
diff
changeset
|
46 #define PREFETCH " # nop" |
b8fe18a742ce
Fix MacIntel build: "/nop" is illegal on Apple's older version of GAS
gpoirier
parents:
20589
diff
changeset
|
47 #define PREFETCHW " # nop" |
18861 | 48 #endif |
49 | |
/* SFENCE: "sfence" when HAVE_MMX2 is defined, otherwise " # nop".
 * NOTE(review): " # nop" is presumably parsed by the assembler as a comment,
 * so that no instruction is emitted -- confirm. */
50 #ifdef HAVE_MMX2 | |
51 #define SFENCE "sfence" | |
52 #else | |
20724
b8fe18a742ce
Fix MacIntel build: "/nop" is illegal on Apple's older version of GAS
gpoirier
parents:
20589
diff
changeset
|
53 #define SFENCE " # nop" |
18861 | 54 #endif |
55 | |
/* PAVGB(a,b): builds the instruction string "<mnemonic> a, b \n\t" by
 * stringifying both arguments. The mnemonic is "pavgb" with HAVE_MMX2 and
 * "pavgusb" with HAVE_3DNOW; the macro stays undefined when neither is set. */
56 #ifdef HAVE_MMX2 | |
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | |
58 #elif defined (HAVE_3DNOW) | |
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" | |
60 #endif | |
61 | |
/* REAL_MOVNTQ(a,b): builds "movntq a, b \n\t" with HAVE_MMX2 and
 * "movq a, b \n\t" otherwise, stringifying both arguments.
 * MOVNTQ(a,b) only forwards to REAL_MOVNTQ: because its parameters are not
 * operands of '#', any macro names inside the arguments are expanded before
 * REAL_MOVNTQ stringifies them.
 * NOTE(review): presumably this is what lets callers write REGa inside the
 * operand -- confirm that REGa is a macro defined elsewhere. */
62 #ifdef HAVE_MMX2 | |
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" | |
64 #else | |
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" | |
66 #endif | |
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) | |
68 | |
69 #ifdef HAVE_ALTIVEC | |
70 #include "swscale_altivec_template.c" | |
71 #endif | |
72 | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 *
 * Expands to one complete asm volatile statement that filters several source
 * lines into one line of 8-bit output.
 *
 *   x      - stringified and used as the displacement of the source loads
 *   offset - string literal; displacement from %0 of the filter list
 *   dest   - output pointer (asm operand %1)
 *   width  - compared against the running index (asm operand %2)
 *
 * Operand %0 is &c->redDither, so a variable named c must be in scope at the
 * expansion site; VROUNDER_OFFSET and offset are displacements from it.
 *
 * Structure (the single label "1:" is the target of both loops):
 *   - REG_a is the output index, starting at 0 and advancing by 8 per store.
 *   - mm3/mm4 start from the value at VROUNDER_OFFSET(%0).
 *   - Inner loop: REG_d walks 16-byte filter entries; each entry holds a
 *     source pointer at +0 and the coefficient at +8. Source words are
 *     multiplied with pmulhw and summed into mm3/mm4. The walk stops when the
 *     next entry's pointer is zero.
 *   - Then psraw 3, packuswb, and an 8-byte store to dest + REG_a via MOVNTQ.
 *   - The accumulators and REG_d/REG_S are re-initialised between "cmp" and
 *     "jb"; this relies on those instructions leaving the flags untouched.
 *
 * Clobbers REG_a, REG_d and REG_S.
 * NOTE(review): the statement stores through dest but declares neither an
 * output operand nor a "memory" clobber, and the MMX registers it uses are
 * not listed as clobbered -- confirm this is safe with the compilers in use.
 */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
74 asm volatile(\ |
18861 | 75 "xor %%"REG_a", %%"REG_a" \n\t"\ |
76 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\ | |
77 "movq %%mm3, %%mm4 \n\t"\ | |
78 "lea " offset "(%0), %%"REG_d" \n\t"\ | |
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
80 ASMALIGN(4) /* FIXME Unroll? */\ |
18861 | 81 "1: \n\t"\ |
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ | |
83 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\ | |
84 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\ | |
85 "add $16, %%"REG_d" \n\t"\ | |
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
87 "test %%"REG_S", %%"REG_S" \n\t"\ | |
88 "pmulhw %%mm0, %%mm2 \n\t"\ | |
89 "pmulhw %%mm0, %%mm5 \n\t"\ | |
90 "paddw %%mm2, %%mm3 \n\t"\ | |
91 "paddw %%mm5, %%mm4 \n\t"\ | |
92 " jnz 1b \n\t"\ | |
93 "psraw $3, %%mm3 \n\t"\ | |
94 "psraw $3, %%mm4 \n\t"\ | |
95 "packuswb %%mm4, %%mm3 \n\t"\ | |
96 MOVNTQ(%%mm3, (%1, %%REGa))\ | |
97 "add $8, %%"REG_a" \n\t"\ | |
98 "cmp %2, %%"REG_a" \n\t"\ | |
99 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\ | |
100 "movq %%mm3, %%mm4 \n\t"\ | |
101 "lea " offset "(%0), %%"REG_d" \n\t"\ | |
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
103 "jb 1b \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
104 :: "r" (&c->redDither),\ |
21325
963e85e82154
Change "p" asm constraints to "g", since "p" was a no longer necessary hack to
reimar
parents:
21029
diff
changeset
|
105 "r" (dest), "g" (width)\ |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
106 : "%"REG_a, "%"REG_d, "%"REG_S\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
107 ); |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
108 |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 *
 * Expands to one complete asm volatile statement that filters several source
 * lines into one line of 8-bit output, accumulating in 32 bits.
 *
 *   x      - stringified and used as the displacement of the source loads
 *   offset - string literal; displacement from %0 of the filter list
 *   dest   - output pointer (asm operand %1)
 *   width  - compared against the running index (asm operand %2)
 *
 * Operand %0 is &c->redDither, so a variable named c must be in scope at the
 * expansion site.
 *
 * Structure (the single label "1:" is the target of both loops):
 *   - REG_a is the output index, starting at 0 and advancing by 8 per store.
 *   - mm4..mm7 are zeroed and used as dword accumulators.
 *   - Inner loop: each pass consumes two source pointers, read from (REG_d)
 *     and 4(REG_d), and one coefficient quadword at 8(REG_d). The words of
 *     the two sources are interleaved (punpcklwd/punpckhwd), multiplied and
 *     pair-summed with pmaddwd, and added into mm4..mm7. REG_d then advances
 *     by 16; the walk stops when the pointer at the new (REG_d) is zero.
 *   - Afterwards: psrad 16, packssdw to words, add the value at
 *     VROUNDER_OFFSET(%0), psraw 3, packuswb, 8-byte store to dest + REG_a.
 *   - The instructions between "test" and "jnz", and between "cmp" and "jb",
 *     must leave the flags untouched for the branches to work.
 *
 * Clobbers REG_a, REG_d and REG_S.
 * NOTE(review): reading the second source pointer from 4(REG_d) looks like
 * it assumes 4-byte pointers -- confirm behaviour on 64-bit builds.
 * NOTE(review): the statement stores through dest but declares neither an
 * output operand nor a "memory" clobber, and the MMX registers it uses are
 * not listed as clobbered -- confirm this is safe with the compilers in use.
 */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
110 asm volatile(\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
111 "lea " offset "(%0), %%"REG_d" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
112 "xor %%"REG_a", %%"REG_a" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
113 "pxor %%mm4, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
114 "pxor %%mm5, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
115 "pxor %%mm6, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
116 "pxor %%mm7, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\ |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
118 ASMALIGN(4) \ |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
119 "1: \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
120 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0\n\t" /* srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
121 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
122 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
123 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1\n\t" /* srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
124 "movq %%mm0, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
125 "punpcklwd %%mm1, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
126 "punpckhwd %%mm1, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
127 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
128 "pmaddwd %%mm1, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
129 "pmaddwd %%mm1, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
130 "paddd %%mm0, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
131 "paddd %%mm3, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
132 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3\n\t" /* srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
133 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
134 "add $16, %%"REG_d" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
135 "test %%"REG_S", %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
136 "movq %%mm2, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
137 "punpcklwd %%mm3, %%mm2 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
138 "punpckhwd %%mm3, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
139 "pmaddwd %%mm1, %%mm2 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
140 "pmaddwd %%mm1, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
141 "paddd %%mm2, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
142 "paddd %%mm0, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
143 " jnz 1b \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
144 "psrad $16, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
145 "psrad $16, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
146 "psrad $16, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
147 "psrad $16, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
148 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
149 "packssdw %%mm5, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
150 "packssdw %%mm7, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
151 "paddw %%mm0, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
152 "paddw %%mm0, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
153 "psraw $3, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
154 "psraw $3, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
155 "packuswb %%mm6, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
156 MOVNTQ(%%mm4, (%1, %%REGa))\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
157 "add $8, %%"REG_a" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
158 "cmp %2, %%"REG_a" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
159 "lea " offset "(%0), %%"REG_d" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
160 "pxor %%mm4, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
161 "pxor %%mm5, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
162 "pxor %%mm6, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
163 "pxor %%mm7, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
165 "jb 1b \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
166 :: "r" (&c->redDither),\ |
21325
963e85e82154
Change "p" asm constraints to "g", since "p" was a no longer necessary hack to
reimar
parents:
21029
diff
changeset
|
167 "r" (dest), "g" (width)\ |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
168 : "%"REG_a, "%"REG_d, "%"REG_S\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
169 ); |
18861 | 170 |
/*
 * YSCALEYUV2YV121
 *
 * Instruction-string fragment only: it contains no asm() wrapper and no
 * operand list, so the code that uses it must supply both.
 *
 * Operands referenced: %0 = source base, %1 = destination base, %2 = initial
 * value of the index register REG_a.
 *
 * Per iteration it loads eight 16-bit words from %0 + 2*REG_a, shifts each
 * right arithmetically by 7, packs them to eight unsigned bytes with
 * saturation (packuswb), stores them to %1 + REG_a via MOVNTQ, and adds 8 to
 * REG_a. The loop repeats while that add produces no carry.
 * NOTE(review): presumably %2 is a negative count and %0/%1 point at the end
 * of their buffers, so that the index runs up to zero -- confirm at the use
 * site.
 */
171 #define YSCALEYUV2YV121 \ | |
172 "mov %2, %%"REG_a" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
173 ASMALIGN(4) /* FIXME Unroll? */\ |
18861 | 174 "1: \n\t"\ |
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\ | |
176 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\ | |
177 "psraw $7, %%mm0 \n\t"\ | |
178 "psraw $7, %%mm1 \n\t"\ | |
179 "packuswb %%mm1, %%mm0 \n\t"\ | |
180 MOVNTQ(%%mm0, (%1, %%REGa))\ | |
181 "add $8, %%"REG_a" \n\t"\ | |
182 "jnc 1b \n\t" | |
183 | |
184 /* | |
185 :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
186 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
187 "r" (dest), "m" (dstW), | |
188 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
189 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
190 */ | |
/*
 * YSCALEYUV2PACKEDX
 *
 * Opens an asm volatile( statement and emits its first part; the statement
 * is left unterminated, so further instruction strings, the operand list and
 * the closing ");" must follow at the expansion site. Operand %0 is used as
 * the base for the CHR_MMX_FILTER_OFFSET, LUM_MMX_FILTER_OFFSET and
 * VROUNDER_OFFSET displacements.
 *
 * Structure:
 *   - REG_a is zeroed; label "1:" marks the head of the per-output loop. The
 *     branch back to "1:" is not part of this macro.
 *   - First "2:" loop (chroma): walks 16-byte filter entries starting at
 *     CHR_MMX_FILTER_OFFSET(%0); each entry holds a source pointer at +0 and
 *     the coefficient at +8. U words are read at pointer + REG_a and V words
 *     4096 bytes further on. Products (pmulhw) are summed into mm3 (U) and
 *     mm4 (V), both seeded from VROUNDER_OFFSET(%0). The walk ends on a zero
 *     pointer.
 *   - Second "2:" loop (luma): the same walk starting at
 *     LUM_MMX_FILTER_OFFSET(%0), reading words at pointer + 2*REG_a and
 *     8 bytes further on, summed into mm1 (Y1) and mm7 (Y2), also seeded
 *     from VROUNDER_OFFSET(%0).
 *
 * Uses REG_a, REG_d, REG_S and mm0..mm5, mm7.
 */
191 #define YSCALEYUV2PACKEDX \ | |
19173 | 192 asm volatile(\ |
18861 | 193 "xor %%"REG_a", %%"REG_a" \n\t"\ |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
194 ASMALIGN(4)\ |
18861 | 195 "nop \n\t"\ |
196 "1: \n\t"\ | |
197 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\ | |
198 "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
199 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\ | |
200 "movq %%mm3, %%mm4 \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
201 ASMALIGN(4)\ |
18861 | 202 "2: \n\t"\ |
203 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ | |
204 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ | |
205 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ | |
206 "add $16, %%"REG_d" \n\t"\ | |
207 "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
208 "pmulhw %%mm0, %%mm2 \n\t"\ | |
209 "pmulhw %%mm0, %%mm5 \n\t"\ | |
210 "paddw %%mm2, %%mm3 \n\t"\ | |
211 "paddw %%mm5, %%mm4 \n\t"\ | |
212 "test %%"REG_S", %%"REG_S" \n\t"\ | |
213 " jnz 2b \n\t"\ | |
214 \ | |
215 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\ | |
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
217 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\ | |
218 "movq %%mm1, %%mm7 \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
219 ASMALIGN(4)\ |
18861 | 220 "2: \n\t"\ |
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ | |
222 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\ | |
223 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\ | |
224 "add $16, %%"REG_d" \n\t"\ | |
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |
226 "pmulhw %%mm0, %%mm2 \n\t"\ | |
227 "pmulhw %%mm0, %%mm5 \n\t"\ | |
228 "paddw %%mm2, %%mm1 \n\t"\ | |
229 "paddw %%mm5, %%mm7 \n\t"\ | |
230 "test %%"REG_S", %%"REG_S" \n\t"\ | |
231 " jnz 2b \n\t"\ | |
232 | |
/*
 * YSCALEYUV2PACKEDX_END
 *
 * Operand list, clobber list and closing ");" for an asm volatile( statement
 * that was opened earlier at the expansion site. There are no output
 * operands. Inputs, in order:
 *   %0      = &c->redDither (register)
 *   %1..%3  = dummy (memory), three times
 *   %4      = dest (register)
 *   %5      = dstW (memory)
 * Variables named c, dummy, dest and dstW must therefore be in scope.
 * NOTE(review): the three dummy operands presumably exist only to keep the
 * numbering of %4/%5 stable -- confirm against the code that uses them.
 * Clobbers REG_a, REG_d and REG_S.
 */
19173 | 233 #define YSCALEYUV2PACKEDX_END\ |
234 :: "r" (&c->redDither), \ | |
235 "m" (dummy), "m" (dummy), "m" (dummy),\ | |
236 "r" (dest), "m" (dstW)\ | |
237 : "%"REG_a, "%"REG_d, "%"REG_S\ | |
238 ); | |
239 | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
240 #define YSCALEYUV2PACKEDX_ACCURATE \ |
19173 | 241 asm volatile(\ |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
242 "xor %%"REG_a", %%"REG_a" \n\t"\ |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
243 ASMALIGN(4)\ |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
244 "nop \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
245 "1: \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
246 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
247 "mov (%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
248 "pxor %%mm4, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
249 "pxor %%mm5, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
250 "pxor %%mm6, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
251 "pxor %%mm7, %%mm7 \n\t"\ |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
252 ASMALIGN(4)\ |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
253 "2: \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
254 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
255 "movq 4096(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
256 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
257 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
258 "movq %%mm0, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
259 "punpcklwd %%mm1, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
260 "punpckhwd %%mm1, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
261 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
262 "pmaddwd %%mm1, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
263 "pmaddwd %%mm1, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
264 "paddd %%mm0, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
265 "paddd %%mm3, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
266 "movq 4096(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
267 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
268 "add $16, %%"REG_d" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
269 "test %%"REG_S", %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
270 "movq %%mm2, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
271 "punpcklwd %%mm3, %%mm2 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
272 "punpckhwd %%mm3, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
273 "pmaddwd %%mm1, %%mm2 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
274 "pmaddwd %%mm1, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
275 "paddd %%mm2, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
276 "paddd %%mm0, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
277 " jnz 2b \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
278 "psrad $16, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
279 "psrad $16, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
280 "psrad $16, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
281 "psrad $16, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
282 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
283 "packssdw %%mm5, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
284 "packssdw %%mm7, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
285 "paddw %%mm0, %%mm4 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
286 "paddw %%mm0, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
287 "movq %%mm4, "U_TEMP"(%0) \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
288 "movq %%mm6, "V_TEMP"(%0) \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
289 \ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
290 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
291 "mov (%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
292 "pxor %%mm1, %%mm1 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
293 "pxor %%mm5, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
294 "pxor %%mm7, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
295 "pxor %%mm6, %%mm6 \n\t"\ |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
296 ASMALIGN(4)\ |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
297 "2: \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
298 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
299 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
300 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
301 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
302 "movq %%mm0, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
303 "punpcklwd %%mm4, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
304 "punpckhwd %%mm4, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
305 "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
306 "pmaddwd %%mm4, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
307 "pmaddwd %%mm4, %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
308 "paddd %%mm0, %%mm1 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
309 "paddd %%mm3, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
310 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
311 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
312 "add $16, %%"REG_d" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
313 "test %%"REG_S", %%"REG_S" \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
314 "movq %%mm2, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
315 "punpcklwd %%mm3, %%mm2 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
316 "punpckhwd %%mm3, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
317 "pmaddwd %%mm4, %%mm2 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
318 "pmaddwd %%mm4, %%mm0 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
319 "paddd %%mm2, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
320 "paddd %%mm0, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
321 " jnz 2b \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
322 "psrad $16, %%mm1 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
323 "psrad $16, %%mm5 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
324 "psrad $16, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
325 "psrad $16, %%mm6 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
326 "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
327 "packssdw %%mm5, %%mm1 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
328 "packssdw %%mm6, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
329 "paddw %%mm0, %%mm1 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
330 "paddw %%mm0, %%mm7 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
331 "movq "U_TEMP"(%0), %%mm3 \n\t"\ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
332 "movq "V_TEMP"(%0), %%mm4 \n\t"\ |
18861 | 333 |
/*
 * YSCALEYUV2RGBX: YUV -> RGB conversion stage, emitted as inline-asm text.
 * On entry it expects Y in mm1 (Y1) and mm7 (Y2) and U/V in mm3/mm4.
 * Offsets and coefficients are read at the U_OFFSET, V_OFFSET, Y_OFFSET and
 * UG/VG/UB/VR/Y _COEFF displacements from asm operand %0.
 * On exit mm2, mm4 and mm5 hold the packed B, G and R bytes and mm7 is zero.
 * In the register maps below, "1" marks values built from Y1 (mm1) and "2"
 * marks values built from Y2 (mm7).
 */
19173 | 334 #define YSCALEYUV2RGBX \ |
18861 | 335 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ |
336 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ | |
337 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
338 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
339 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ | |
340 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ | |
341 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
342 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ | |
343 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ | |
344 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ | |
345 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ | |
346 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ | |
347 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ | |
348 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
349 "paddw %%mm3, %%mm4 \n\t"\ | |
350 "movq %%mm2, %%mm0 \n\t"\ | |
351 "movq %%mm5, %%mm6 \n\t"\ | |
352 "movq %%mm4, %%mm3 \n\t"\ | |
353 "punpcklwd %%mm2, %%mm2 \n\t"\ | |
354 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
355 "punpcklwd %%mm4, %%mm4 \n\t"\ | |
356 "paddw %%mm1, %%mm2 \n\t"\ | |
357 "paddw %%mm1, %%mm5 \n\t"\ | |
358 "paddw %%mm1, %%mm4 \n\t"\ | |
359 "punpckhwd %%mm0, %%mm0 \n\t"\ | |
360 "punpckhwd %%mm6, %%mm6 \n\t"\ | |
361 "punpckhwd %%mm3, %%mm3 \n\t"\ | |
362 "paddw %%mm7, %%mm0 \n\t"\ | |
363 "paddw %%mm7, %%mm6 \n\t"\ | |
364 "paddw %%mm7, %%mm3 \n\t"\ | |
365 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
366 "packuswb %%mm0, %%mm2 \n\t"\ | |
367 "packuswb %%mm6, %%mm5 \n\t"\ | |
368 "packuswb %%mm3, %%mm4 \n\t"\ | |
369 "pxor %%mm7, %%mm7 \n\t" | |
/*
 * FULL_YSCALEYUV2RGB is compiled out by the surrounding "#if 0".
 * It blends two luma lines (%0, %1) and two chroma lines (%2, %3) using the
 * weights broadcast from %6 (yalpha1) and %7 (uvalpha1), then converts to RGB
 * with the global w80, w400 and *Coeff constants.
 * It leaves packed B in mm3, R in mm0 and G in mm1.
 * The macro opens label "1:" but contains no loop branch of its own.
 */
370 #if 0 | |
371 #define FULL_YSCALEYUV2RGB \ | |
372 "pxor %%mm7, %%mm7 \n\t"\ | |
373 "movd %6, %%mm6 \n\t" /*yalpha1*/\ | |
374 "punpcklwd %%mm6, %%mm6 \n\t"\ | |
375 "punpcklwd %%mm6, %%mm6 \n\t"\ | |
376 "movd %7, %%mm5 \n\t" /*uvalpha1*/\ | |
377 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
378 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
379 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
380 ASMALIGN(4)\ |
18861 | 381 "1: \n\t"\ |
382 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\ | |
383 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
384 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
385 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
386 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
387 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | |
388 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
389 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
390 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\ | |
391 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
392 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\ | |
393 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
394 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ | |
395 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\ | |
396 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
397 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\ | |
398 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\ | |
399 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\ | |
400 \ | |
401 \ | |
402 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
403 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
404 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\ | |
405 "psraw $4, %%mm0 \n\t" /* uvbuf1[eax+2048] >>4*/\ | |
406 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\ | |
407 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\ | |
408 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\ | |
409 \ | |
410 \ | |
411 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ | |
412 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\ | |
413 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\ | |
414 "paddw %%mm1, %%mm3 \n\t" /* B*/\ | |
415 "paddw %%mm1, %%mm0 \n\t" /* R*/\ | |
416 "packuswb %%mm3, %%mm3 \n\t"\ | |
417 \ | |
418 "packuswb %%mm0, %%mm0 \n\t"\ | |
419 "paddw %%mm4, %%mm2 \n\t"\ | |
420 "paddw %%mm2, %%mm1 \n\t" /* G*/\ | |
421 \ | |
422 "packuswb %%mm1, %%mm1 \n\t" | |
423 #endif | |
424 | |
/*
 * REAL_YSCALEYUV2PACKED(index, c): blend two luma lines (%0, %1) and two
 * chroma lines (%2, %3) for packed YUV output.
 * Before the loop, the weights stored at CHR_MMX_FILTER_OFFSET+8 and
 * LUM_MMX_FILTER_OFFSET+8 in "c" are shifted right by 3 and written back,
 * so the context is modified in place.
 * The loop body then leaves U in mm3, V in mm4 and Y in mm1/mm7, all scaled
 * down by 7 bits.
 * Label "1:" is opened here; the loop branch comes from the code that follows.
 * YSCALEYUV2PACKED is the wrapper that lets macro arguments expand first.
 */
425 #define REAL_YSCALEYUV2PACKED(index, c) \ | |
426 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\ | |
427 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\ | |
428 "psraw $3, %%mm0 \n\t"\ | |
429 "psraw $3, %%mm1 \n\t"\ | |
430 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\ | |
431 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\ | |
432 "xor "#index", "#index" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
433 ASMALIGN(4)\ |
18861 | 434 "1: \n\t"\ |
435 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
436 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
437 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | |
438 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
439 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | |
440 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
441 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\ | |
442 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
443 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
444 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\ | |
445 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\ | |
446 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\ | |
447 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\ | |
448 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ | |
449 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
450 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax+4]*/\ | |
451 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax+4]*/\ | |
452 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
453 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4]*/\ | |
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
455 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\ | |
456 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\ | |
457 "psraw $7, %%mm7 \n\t" /* buf1[eax+4] >>7*/\ | |
458 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
459 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/\ | |
460 | |
461 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) | |
462 | |
/*
 * REAL_YSCALEYUV2RGB(index, c): blend two luma lines (%0, %1) and two chroma
 * lines (%2, %3), then convert to RGB.
 * Blend weights come from CHR_MMX_FILTER_OFFSET+8 and LUM_MMX_FILTER_OFFSET+8
 * in "c"; offsets and coefficients come from the *_OFFSET and *_COEFF
 * displacements in "c".
 * It leaves packed B in mm2, G in mm4, R in mm5 and zero in mm7.
 * Label "1:" is opened here; the loop branch comes from the code that follows.
 * In the register maps, "1" marks values built from Y1 (mm1) and "2" marks
 * values built from Y2 (mm7).
 * YSCALEYUV2RGB is the wrapper that lets macro arguments expand first.
 */
463 #define REAL_YSCALEYUV2RGB(index, c) \ | |
464 "xor "#index", "#index" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
465 ASMALIGN(4)\ |
18861 | 466 "1: \n\t"\ |
467 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
468 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
469 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\ | |
470 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\ | |
471 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | |
472 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |
473 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\ | |
474 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |
475 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |
476 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\ | |
477 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\ | |
478 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\ | |
479 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\ | |
480 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ | |
481 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
482 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
483 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
484 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\ | |
485 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\ | |
486 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
487 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ | |
488 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ | |
489 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax+4]*/\ | |
490 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax+4]*/\ | |
491 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |
492 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4]*/\ | |
493 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |
494 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\ | |
495 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\ | |
496 "psraw $4, %%mm7 \n\t" /* buf1[eax+4] >>4*/\ | |
497 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |
498 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/\ | |
499 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\ | |
500 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\ | |
501 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
502 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
503 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
504 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
505 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
506 "paddw %%mm3, %%mm4 \n\t"\ | |
507 "movq %%mm2, %%mm0 \n\t"\ | |
508 "movq %%mm5, %%mm6 \n\t"\ | |
509 "movq %%mm4, %%mm3 \n\t"\ | |
510 "punpcklwd %%mm2, %%mm2 \n\t"\ | |
511 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
512 "punpcklwd %%mm4, %%mm4 \n\t"\ | |
513 "paddw %%mm1, %%mm2 \n\t"\ | |
514 "paddw %%mm1, %%mm5 \n\t"\ | |
515 "paddw %%mm1, %%mm4 \n\t"\ | |
516 "punpckhwd %%mm0, %%mm0 \n\t"\ | |
517 "punpckhwd %%mm6, %%mm6 \n\t"\ | |
518 "punpckhwd %%mm3, %%mm3 \n\t"\ | |
519 "paddw %%mm7, %%mm0 \n\t"\ | |
520 "paddw %%mm7, %%mm6 \n\t"\ | |
521 "paddw %%mm7, %%mm3 \n\t"\ | |
522 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
523 "packuswb %%mm0, %%mm2 \n\t"\ | |
524 "packuswb %%mm6, %%mm5 \n\t"\ | |
525 "packuswb %%mm3, %%mm4 \n\t"\ | |
526 "pxor %%mm7, %%mm7 \n\t" | |
527 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c) | |
528 | |
/*
 * REAL_YSCALEYUV2PACKED1(index, c): single-line variant for packed YUV
 * output, with no blending.
 * It reads only %0 (buf0) and %2 (uvbuf0); "c" is accepted but not referenced.
 * It leaves U in mm3, V in mm4 and Y in mm1/mm7, each shifted right by 7.
 * Label "1:" is opened here; the loop branch comes from the code that follows.
 * YSCALEYUV2PACKED1 is the wrapper that lets macro arguments expand first.
 */
529 #define REAL_YSCALEYUV2PACKED1(index, c) \ | |
530 "xor "#index", "#index" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
531 ASMALIGN(4)\ |
18861 | 532 "1: \n\t"\ |
533 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ | |
534 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
535 "psraw $7, %%mm3 \n\t" \ | |
536 "psraw $7, %%mm4 \n\t" \ | |
537 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | |
538 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\ | |
539 "psraw $7, %%mm1 \n\t" \ | |
540 "psraw $7, %%mm7 \n\t" \ | |
541 | |
542 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) | |
543 | |
/*
 * REAL_YSCALEYUV2RGB1(index, c): single-line YUV -> RGB, with no blending.
 * It reads only %0 (buf0) and %2 (uvbuf0), shifts the samples right by 4,
 * then applies the *_OFFSET and *_COEFF values stored in "c".
 * It leaves packed B in mm2, G in mm4, R in mm5 and zero in mm7.
 * Label "1:" is opened here; the loop branch comes from the code that follows.
 * In the register maps, "1" marks values built from Y1 (mm1) and "2" marks
 * values built from Y2 (mm7).
 * YSCALEYUV2RGB1 is the wrapper that lets macro arguments expand first.
 */
544 #define REAL_YSCALEYUV2RGB1(index, c) \ | |
545 "xor "#index", "#index" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
546 ASMALIGN(4)\ |
18861 | 547 "1: \n\t"\ |
548 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ | |
549 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |
550 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\ | |
551 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\ | |
552 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ | |
553 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
554 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
555 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
556 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\ | |
557 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\ | |
558 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
559 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | |
560 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\ | |
561 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\ | |
562 "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\ | |
563 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\ | |
564 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\ | |
565 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
566 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
567 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
568 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
569 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
570 "paddw %%mm3, %%mm4 \n\t"\ | |
571 "movq %%mm2, %%mm0 \n\t"\ | |
572 "movq %%mm5, %%mm6 \n\t"\ | |
573 "movq %%mm4, %%mm3 \n\t"\ | |
574 "punpcklwd %%mm2, %%mm2 \n\t"\ | |
575 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
576 "punpcklwd %%mm4, %%mm4 \n\t"\ | |
577 "paddw %%mm1, %%mm2 \n\t"\ | |
578 "paddw %%mm1, %%mm5 \n\t"\ | |
579 "paddw %%mm1, %%mm4 \n\t"\ | |
580 "punpckhwd %%mm0, %%mm0 \n\t"\ | |
581 "punpckhwd %%mm6, %%mm6 \n\t"\ | |
582 "punpckhwd %%mm3, %%mm3 \n\t"\ | |
583 "paddw %%mm7, %%mm0 \n\t"\ | |
584 "paddw %%mm7, %%mm6 \n\t"\ | |
585 "paddw %%mm7, %%mm3 \n\t"\ | |
586 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
587 "packuswb %%mm0, %%mm2 \n\t"\ | |
588 "packuswb %%mm6, %%mm5 \n\t"\ | |
589 "packuswb %%mm3, %%mm4 \n\t"\ | |
590 "pxor %%mm7, %%mm7 \n\t" | |
591 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) | |
592 | |
/*
 * REAL_YSCALEYUV2PACKED1b(index, c): packed YUV variant that takes luma from
 * a single line (%0) and sums the two chroma lines (%2, %3).
 * The chroma sum is shifted right by 8 (logical shift); luma is shifted
 * right by 7.
 * It leaves U in mm3, V in mm4 and Y in mm1/mm7; "c" is accepted but not
 * referenced.
 * Label "1:" is opened here; the loop branch comes from the code that follows.
 * YSCALEYUV2PACKED1b is the wrapper that lets macro arguments expand first.
 */
593 #define REAL_YSCALEYUV2PACKED1b(index, c) \ | |
594 "xor "#index", "#index" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
595 ASMALIGN(4)\ |
18861 | 596 "1: \n\t"\ |
597 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
598 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
599 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | |
600 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
601 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ | |
602 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ | |
603 "psrlw $8, %%mm3 \n\t" \ | |
604 "psrlw $8, %%mm4 \n\t" \ | |
605 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | |
606 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\ | |
607 "psraw $7, %%mm1 \n\t" \ | |
608 "psraw $7, %%mm7 \n\t" | |
609 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) | |
610 | |
/*
 * REAL_YSCALEYUV2RGB1b(index, c): YUV -> RGB variant that takes luma from a
 * single line (%0) and sums the two chroma lines (%2, %3).
 * The chroma sum is shifted right by 5 with a logical shift (see the FIXME
 * notes about overflow); luma is shifted right by 4.
 * It leaves packed B in mm2, G in mm4, R in mm5 and zero in mm7.
 * Label "1:" is opened here; the loop branch comes from the code that follows.
 * In the register maps, "1" marks values built from Y1 (mm1) and "2" marks
 * values built from Y2 (mm7).
 * YSCALEYUV2RGB1b is the wrapper that lets macro arguments expand first.
 */
611 // do vertical chrominance interpolation | |
612 #define REAL_YSCALEYUV2RGB1b(index, c) \ | |
613 "xor "#index", "#index" \n\t"\ | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
614 ASMALIGN(4)\ |
18861 | 615 "1: \n\t"\ |
616 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |
617 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |
618 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | |
619 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | |
620 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ | |
621 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ | |
622 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ | |
623 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ | |
624 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ | |
625 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | |
626 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
627 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
628 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\ | |
629 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\ | |
630 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
631 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | |
632 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\ | |
633 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\ | |
634 "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\ | |
635 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\ | |
636 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\ | |
637 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | |
638 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | |
639 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | |
640 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | |
641 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
642 "paddw %%mm3, %%mm4 \n\t"\ | |
643 "movq %%mm2, %%mm0 \n\t"\ | |
644 "movq %%mm5, %%mm6 \n\t"\ | |
645 "movq %%mm4, %%mm3 \n\t"\ | |
646 "punpcklwd %%mm2, %%mm2 \n\t"\ | |
647 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
648 "punpcklwd %%mm4, %%mm4 \n\t"\ | |
649 "paddw %%mm1, %%mm2 \n\t"\ | |
650 "paddw %%mm1, %%mm5 \n\t"\ | |
651 "paddw %%mm1, %%mm4 \n\t"\ | |
652 "punpckhwd %%mm0, %%mm0 \n\t"\ | |
653 "punpckhwd %%mm6, %%mm6 \n\t"\ | |
654 "punpckhwd %%mm3, %%mm3 \n\t"\ | |
655 "paddw %%mm7, %%mm0 \n\t"\ | |
656 "paddw %%mm7, %%mm6 \n\t"\ | |
657 "paddw %%mm7, %%mm3 \n\t"\ | |
658 /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
659 "packuswb %%mm0, %%mm2 \n\t"\ | |
660 "packuswb %%mm6, %%mm5 \n\t"\ | |
661 "packuswb %%mm3, %%mm4 \n\t"\ | |
662 "pxor %%mm7, %%mm7 \n\t" | |
663 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) | |
664 | |
/*
 * REAL_WRITEBGR32(dst, dstw, index): loop tail that stores 8 pixels as
 * 32-bit values.
 * It interleaves the packed B (mm2), G (mm4) and R (mm5) bytes with the zero
 * in mm7 and writes four quadwords at dst + index*4 through MOVNTQ.
 * It then adds 8 to index and jumps back to label "1:" while index is below
 * dstw (unsigned compare). Label "1:" must be defined by the code emitted
 * before this macro.
 * WRITEBGR32 is the wrapper that lets macro arguments expand first.
 */
665 #define REAL_WRITEBGR32(dst, dstw, index) \ | |
666 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ | |
667 "movq %%mm2, %%mm1 \n\t" /* B */\ | |
668 "movq %%mm5, %%mm6 \n\t" /* R */\ | |
669 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
670 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
671 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
672 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
673 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
674 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
675 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
676 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
677 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
678 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
679 \ | |
680 MOVNTQ(%%mm0, (dst, index, 4))\ | |
681 MOVNTQ(%%mm2, 8(dst, index, 4))\ | |
682 MOVNTQ(%%mm1, 16(dst, index, 4))\ | |
683 MOVNTQ(%%mm3, 24(dst, index, 4))\ | |
684 \ | |
685 "add $8, "#index" \n\t"\ | |
686 "cmp "#dstw", "#index" \n\t"\ | |
687 " jb 1b \n\t" | |
688 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index) | |
689 | |
/*
 * REAL_WRITEBGR16(dst, dstw, index): loop tail that stores 8 pixels as
 * 16-bit values.
 * It expects packed B in mm2, G in mm4, R in mm5 and zero in mm7.
 * B and R are masked with bF8 and G with bFC; B is shifted right by 3 and
 * the widened G is shifted left by 3 before the fields are OR-ed together.
 * NOTE(review): the mask values are defined outside this view; the names
 * suggest 0xF8/0xFC bytes, giving a 5-6-5 layout - confirm.
 * Two quadwords are written at dst + index*2 through MOVNTQ, then index is
 * advanced by 8 and control returns to label "1:" while index is below dstw.
 * WRITEBGR16 is the wrapper that lets macro arguments expand first.
 */
690 #define REAL_WRITEBGR16(dst, dstw, index) \ | |
691 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ | |
692 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\ | |
693 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ | |
694 "psrlq $3, %%mm2 \n\t"\ | |
695 \ | |
696 "movq %%mm2, %%mm1 \n\t"\ | |
697 "movq %%mm4, %%mm3 \n\t"\ | |
698 \ | |
699 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
700 "punpcklbw %%mm5, %%mm2 \n\t"\ | |
701 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
702 "punpckhbw %%mm5, %%mm1 \n\t"\ | |
703 \ | |
704 "psllq $3, %%mm3 \n\t"\ | |
705 "psllq $3, %%mm4 \n\t"\ | |
706 \ | |
707 "por %%mm3, %%mm2 \n\t"\ | |
708 "por %%mm4, %%mm1 \n\t"\ | |
709 \ | |
710 MOVNTQ(%%mm2, (dst, index, 2))\ | |
711 MOVNTQ(%%mm1, 8(dst, index, 2))\ | |
712 \ | |
713 "add $8, "#index" \n\t"\ | |
714 "cmp "#dstw", "#index" \n\t"\ | |
715 " jb 1b \n\t" | |
716 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index) | |
717 | |
/*
 * REAL_WRITEBGR15(dst, dstw, index): loop tail that stores 8 pixels as
 * 16-bit values, using the same mask (bF8) for all three channels.
 * It expects packed B in mm2, G in mm4, R in mm5 and zero in mm7.
 * B is shifted right by 3, R right by 1, and the widened G left by 2 before
 * the fields are OR-ed together.
 * NOTE(review): bF8 is defined outside this view; the name suggests 0xF8
 * bytes, giving a 5-5-5 layout - confirm.
 * Two quadwords are written at dst + index*2 through MOVNTQ, then index is
 * advanced by 8 and control returns to label "1:" while index is below dstw.
 * WRITEBGR15 is the wrapper that lets macro arguments expand first.
 */
718 #define REAL_WRITEBGR15(dst, dstw, index) \ | |
719 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ | |
720 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ | |
721 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ | |
722 "psrlq $3, %%mm2 \n\t"\ | |
723 "psrlq $1, %%mm5 \n\t"\ | |
724 \ | |
725 "movq %%mm2, %%mm1 \n\t"\ | |
726 "movq %%mm4, %%mm3 \n\t"\ | |
727 \ | |
728 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
729 "punpcklbw %%mm5, %%mm2 \n\t"\ | |
730 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
731 "punpckhbw %%mm5, %%mm1 \n\t"\ | |
732 \ | |
733 "psllq $2, %%mm3 \n\t"\ | |
734 "psllq $2, %%mm4 \n\t"\ | |
735 \ | |
736 "por %%mm3, %%mm2 \n\t"\ | |
737 "por %%mm4, %%mm1 \n\t"\ | |
738 \ | |
739 MOVNTQ(%%mm2, (dst, index, 2))\ | |
740 MOVNTQ(%%mm1, 8(dst, index, 2))\ | |
741 \ | |
742 "add $8, "#index" \n\t"\ | |
743 "cmp "#dstw", "#index" \n\t"\ | |
744 " jb 1b \n\t" | |
745 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index) | |
746 | |
/*
 * WRITEBGR24OLD(dst, dstw, index): loop tail that stores 8 pixels as 24
 * bytes, using the bm00000111, bm11111000 and bm00001111 mask constants.
 * It expects packed B in mm2, G in mm4, R in mm5 and zero in mm7.
 * It first builds four 0RGB0RGB quadwords, then squeezes out the padding
 * bytes and writes three quadwords at dst through MOVNTQ.
 * Afterwards dst is advanced by 24 and index by 8, and control returns to
 * label "1:" while index is below dstw. No WRITEBGR24 definition visible in
 * this part of the file selects this variant.
 */
747 #define WRITEBGR24OLD(dst, dstw, index) \ | |
748 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ | |
749 "movq %%mm2, %%mm1 \n\t" /* B */\ | |
750 "movq %%mm5, %%mm6 \n\t" /* R */\ | |
751 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
752 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
753 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
754 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
755 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
756 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
757 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
758 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
759 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
760 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
761 \ | |
762 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ | |
763 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\ | |
764 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\ | |
765 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\ | |
766 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\ | |
767 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\ | |
768 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\ | |
769 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ | |
770 \ | |
771 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
772 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\ | |
773 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\ | |
774 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\ | |
775 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\ | |
776 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\ | |
777 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\ | |
778 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\ | |
779 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\ | |
780 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\ | |
781 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\ | |
782 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\ | |
783 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\ | |
784 \ | |
785 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\ | |
786 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\ | |
787 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\ | |
788 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\ | |
789 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\ | |
790 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\ | |
791 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\ | |
792 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\ | |
793 \ | |
794 MOVNTQ(%%mm0, (dst))\ | |
795 MOVNTQ(%%mm2, 8(dst))\ | |
796 MOVNTQ(%%mm3, 16(dst))\ | |
797 "add $24, "#dst" \n\t"\ | |
798 \ | |
799 "add $8, "#index" \n\t"\ | |
800 "cmp "#dstw", "#index" \n\t"\ | |
801 " jb 1b \n\t" | |
802 | |
/*
 * WRITEBGR24MMX(dst, dstw, index): loop tail that stores 8 pixels as 24
 * bytes using only shifts, unpacks and ORs (no mask constants).
 * It expects packed B in mm2, G in mm4, R in mm5 and zero in mm7; all eight
 * MMX registers are overwritten.
 * Three quadwords are written at dst through MOVNTQ, then dst is advanced by
 * 24 and index by 8, and control returns to label "1:" while index is below
 * dstw. This is the variant WRITEBGR24 maps to when HAVE_MMX2 is not defined.
 */
803 #define WRITEBGR24MMX(dst, dstw, index) \ | |
804 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ | |
805 "movq %%mm2, %%mm1 \n\t" /* B */\ | |
806 "movq %%mm5, %%mm6 \n\t" /* R */\ | |
807 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
808 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
809 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
810 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
811 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
812 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
813 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
814 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
815 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
816 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
817 \ | |
818 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ | |
819 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ | |
820 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ | |
821 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ | |
822 \ | |
823 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ | |
824 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ | |
825 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ | |
826 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ | |
827 \ | |
828 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ | |
829 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ | |
830 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ | |
831 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ | |
832 \ | |
833 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ | |
834 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ | |
835 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ | |
836 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ | |
837 MOVNTQ(%%mm0, (dst))\ | |
838 \ | |
839 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ | |
840 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ | |
841 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ | |
842 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ | |
843 MOVNTQ(%%mm6, 8(dst))\ | |
844 \ | |
845 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ | |
846 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ | |
847 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ | |
848 MOVNTQ(%%mm5, 16(dst))\ | |
849 \ | |
850 "add $24, "#dst" \n\t"\ | |
851 \ | |
852 "add $8, "#index" \n\t"\ | |
853 "cmp "#dstw", "#index" \n\t"\ | |
854 " jb 1b \n\t" | |
855 | |
/*
 * WRITEBGR24MMX2(dst, dstw, index): loop tail that stores 8 pixels as 24
 * bytes using pshufw and the M24A/M24B/M24C mask constants.
 * It expects packed B in mm2, G in mm4 and R in mm5. The zero in mm7 is not
 * preserved: mm7 is immediately reloaded with M24C and mm0 with M24A.
 * Each of the three output quadwords is built by shuffling the needed words
 * of B, G and R, masking them and OR-ing them together; G is shifted by one
 * byte so it lands between B and R.
 * Three quadwords are written at dst through MOVNTQ, then dst is advanced by
 * 24 and index by 8, and control returns to label "1:" while index is below
 * dstw. This is the variant WRITEBGR24 maps to when HAVE_MMX2 is defined.
 */
856 #define WRITEBGR24MMX2(dst, dstw, index) \ | |
857 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ | |
858 "movq "MANGLE(M24A)", %%mm0 \n\t"\ | |
859 "movq "MANGLE(M24C)", %%mm7 \n\t"\ | |
860 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ | |
861 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ | |
862 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ | |
863 \ | |
864 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ | |
865 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ | |
866 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ | |
867 \ | |
868 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ | |
869 "por %%mm1, %%mm6 \n\t"\ | |
870 "por %%mm3, %%mm6 \n\t"\ | |
871 MOVNTQ(%%mm6, (dst))\ | |
872 \ | |
873 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ | |
874 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ | |
875 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ | |
876 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ | |
877 \ | |
878 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ | |
879 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ | |
880 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ | |
881 \ | |
882 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ | |
883 "por %%mm3, %%mm6 \n\t"\ | |
884 MOVNTQ(%%mm6, 8(dst))\ | |
885 \ | |
886 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\ | |
887 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ | |
888 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\ | |
889 \ | |
890 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ | |
891 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ | |
892 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ | |
893 \ | |
894 "por %%mm1, %%mm3 \n\t"\ | |
895 "por %%mm3, %%mm6 \n\t"\ | |
896 MOVNTQ(%%mm6, 16(dst))\ | |
897 \ | |
898 "add $24, "#dst" \n\t"\ | |
899 \ | |
900 "add $8, "#index" \n\t"\ | |
901 "cmp "#dstw", "#index" \n\t"\ | |
902 " jb 1b \n\t" | |
903 | |
/*
 * Select the 24-bit writer: WRITEBGR24 maps to the pshufw-based
 * WRITEBGR24MMX2 when HAVE_MMX2 is defined and to WRITEBGR24MMX otherwise.
 * The #undef in each branch drops any earlier definition of WRITEBGR24
 * before it is redefined.
 */
904 #ifdef HAVE_MMX2 | |
905 #undef WRITEBGR24 | |
906 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index) | |
907 #else | |
908 #undef WRITEBGR24 | |
909 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) | |
910 #endif | |
911 | |
/*
 * Inline-asm fragment that packs the 16-bit words held in mm1/mm7 and mm3/mm4
 * down to unsigned bytes, interleaves them, and stores the resulting 16 bytes
 * at dst + 2*index (two 8-byte MOVNTQ stores).
 * It then advances index by 8 and jumps back to local label "1" while
 * index is below dstw (unsigned compare, "jb").
 * NOTE(review): presumably mm1/mm7 carry luma and mm3/mm4 carry U/V, giving
 * Y U Y V byte order - confirm against the YSCALEYUV2PACKED* macros.
 *
 * WRITEYUY2 is a forwarding wrapper around REAL_WRITEYUY2; presumably it exists
 * so that macro arguments are expanded before REAL_WRITEYUY2 stringifies them
 * with '#'.
 */
912 #define REAL_WRITEYUY2(dst, dstw, index) \ | |
913 "packuswb %%mm3, %%mm3 \n\t"\ | |
914 "packuswb %%mm4, %%mm4 \n\t"\ | |
915 "packuswb %%mm7, %%mm1 \n\t"\ | |
916 "punpcklbw %%mm4, %%mm3 \n\t"\ | |
917 "movq %%mm1, %%mm7 \n\t"\ | |
918 "punpcklbw %%mm3, %%mm1 \n\t"\ | |
919 "punpckhbw %%mm3, %%mm7 \n\t"\ | |
920 \ | |
921 MOVNTQ(%%mm1, (dst, index, 2))\ | |
922 MOVNTQ(%%mm7, 8(dst, index, 2))\ | |
923 \ | |
924 "add $8, "#index" \n\t"\ | |
925 "cmp "#dstw", "#index" \n\t"\ | |
926 " jb 1b \n\t" | |
927 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) | |
928 | |
929 | |
/**
 * Vertical filter pass producing planar output (dest / uDest / vDest).
 *
 * With HAVE_MMX the work is done by the YSCALEYUV2YV12X macros, or by their
 * _ACCURATE variants when c->flags has SWS_ACCURATE_RND set. The two chroma
 * planes are only written when uDest is non-NULL; the luma plane is always
 * written. Without MMX the call is forwarded unchanged to
 * yuv2yuvX_altivec_real() (HAVE_ALTIVEC) or to yuv2yuvXinC().
 *
 * NOTE(review): the first macro argument (0 for U, 4096 for V) is presumably
 * the offset of the V samples inside a chroma line - confirm against the
 * macro definition.
 */
930 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | |
931 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
932 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) | |
933 { | |
934 #ifdef HAVE_MMX | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
935 if(c->flags & SWS_ACCURATE_RND){ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
936 if(uDest){ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
937 YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW) |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
938 YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW) |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
939 } |
18861 | 940 |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
941 YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW) |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
942 }else{ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
943 if(uDest){ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
944 YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW) |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
945 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW) |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
946 } |
18861 | 947 |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
948 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW) |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
949 } |
/* Non-MMX builds: forward the same arguments (minus the context) to the AltiVec or plain C implementation. */
18861 | 950 #else
951 #ifdef HAVE_ALTIVEC | |
952 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize, | |
953 chrFilter, chrSrc, chrFilterSize, | |
954 dest, uDest, vDest, dstW, chrDstW); | |
955 #else //HAVE_ALTIVEC | |
956 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, | |
957 chrFilter, chrSrc, chrFilterSize, | |
958 dest, uDest, vDest, dstW, chrDstW); | |
959 #endif //!HAVE_ALTIVEC | |
960 #endif | |
961 } | |
962 | |
/**
 * Vertical filter pass for the NV12-style output path.
 *
 * This template variant has no optimized implementation: it forwards every
 * argument except the context to yuv2nv12XinC(). The parameter c is not used
 * here.
 */
963 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | |
964 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
965 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat) | |
966 { | |
967 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize, | |
968 chrFilter, chrSrc, chrFilterSize, | |
969 dest, uDest, dstW, chrDstW, dstFormat); | |
970 } | |
971 | |
/**
 * Unfiltered vertical pass: converts one line of 16-bit intermediate samples
 * to 8-bit planar output.
 *
 * lumSrc holds dstW luma samples. chrSrc holds chrDstW U samples, with the
 * matching V samples 2048 elements further on. The chroma planes are only
 * written when uDest is non-NULL.
 *
 * With HAVE_MMX each plane is handled by the YSCALEYUV2YV121 asm fragment,
 * which receives pointers to the END of the source and destination lines plus
 * the negated width. Otherwise a C loop shifts each sample right by 7 and
 * clamps it to 0..255.
 */
972 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc, | |
973 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) | |
974 { | |
975 #ifdef HAVE_MMX | |
976 if(uDest != NULL) | |
977 { | |
/* U plane */
978 asm volatile( | |
979 YSCALEYUV2YV121 | |
980 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW), | |
981 "g" (-chrDstW) | |
982 : "%"REG_a | |
983 ); | |
984 | |
/* V plane: same line buffer, 2048 samples after the U data */
985 asm volatile( | |
986 YSCALEYUV2YV121 | |
987 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW), | |
988 "g" (-chrDstW) | |
989 : "%"REG_a | |
990 ); | |
991 } | |
992 | |
/* luma plane */
993 asm volatile( | |
994 YSCALEYUV2YV121 | |
995 :: "r" (lumSrc + dstW), "r" (dest + dstW), | |
996 "g" (-dstW) | |
997 : "%"REG_a | |
998 ); | |
999 #else | |
1000 int i; | |
1001 for(i=0; i<dstW; i++) | |
1002 { | |
1003 int val= lumSrc[i]>>7; | |
1004 | |
/* bit 8 set is used as a quick "outside 0..255" test; only then do the real clamp */
1005 if(val&256){ | |
1006 if(val<0) val=0; | |
1007 else val=255; | |
1008 } | |
1009 | |
1010 dest[i]= val; | |
1011 } | |
1012 | |
1013 if(uDest != NULL) | |
1014 for(i=0; i<chrDstW; i++) | |
1015 { | |
1016 int u=chrSrc[i]>>7; | |
1017 int v=chrSrc[i + 2048]>>7; | |
1018 | |
/* one combined range test for both chroma samples, then clamp each separately */
1019 if((u|v)&256){ | |
1020 if(u<0) u=0; | |
1021 else if (u>255) u=255; | |
1022 if(v<0) v=0; | |
1023 else if (v>255) v=255; | |
1024 } | |
1025 | |
1026 uDest[i]= u; | |
1027 vDest[i]= v; | |
1028 } | |
1029 #endif | |
1030 } | |
1031 | |
1032 | |
1033 /** | |
1034 * vertical scale YV12 to RGB | |
 *
 * Vertical filter pass producing one line of packed output in dest.
 *
 * With HAVE_MMX the function switches on c->dstFormat and handles
 * PIX_FMT_RGB32, PIX_FMT_BGR24, PIX_FMT_BGR555, PIX_FMT_BGR565 and
 * PIX_FMT_YUYV422 in inline asm, returning directly. There are two parallel
 * switches: one built from YSCALEYUV2PACKEDX_ACCURATE when c->flags has
 * SWS_ACCURATE_RND set, and one built from YSCALEYUV2PACKEDX otherwise.
 * Any other format falls out of the switch and continues below.
 *
 * After the MMX section, HAVE_ALTIVEC builds call altivec_yuv2packedX() for
 * the formats listed in the condition; everything else is forwarded to
 * yuv2packedXinC().
 *
 * NOTE(review): in the two PIX_FMT_YUYV422 cases the register comment
 * ("mm2=B, mm4=G, mm5=R") looks copied from the RGB cases; the code there
 * shifts mm1/mm3/mm4/mm7, which WRITEYUY2 then packs - confirm.
1035 */ | |
1036 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | |
1037 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
1038 uint8_t *dest, long dstW, long dstY) | |
1039 { | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1040 #ifdef HAVE_MMX |
20015
d08ba4508bb0
Fix unused variable warning when compiling with MMX disabled.
diego
parents:
19872
diff
changeset
|
1041 long dummy=0; |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1042 if(c->flags & SWS_ACCURATE_RND){ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1043 switch(c->dstFormat){ |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1044 case PIX_FMT_RGB32: |
19173 | 1045 YSCALEYUV2PACKEDX_ACCURATE |
1046 YSCALEYUV2RGBX | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1047 WRITEBGR32(%4, %5, %%REGa) |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1048 |
19173 | 1049 YSCALEYUV2PACKEDX_END |
1050 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1051 case PIX_FMT_BGR24: |
19173 | 1052 YSCALEYUV2PACKEDX_ACCURATE |
1053 YSCALEYUV2RGBX | |
/* 24-bit output needs its own write pointer: REG_c = dest (%4) + 3*REG_a */
19396 | 1054 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize |
1055 "add %4, %%"REG_c" \n\t" | |
1056 WRITEBGR24(%%REGc, %5, %%REGa) | |
19173 | 1057 |
1058 | |
/* operand list spelled out here instead of YSCALEYUV2PACKEDX_END because REG_c is additionally clobbered */
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1059 :: "r" (&c->redDither), |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1060 "m" (dummy), "m" (dummy), "m" (dummy), |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1061 "r" (dest), "m" (dstW) |
19396 | 1062 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1063 ); |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1064 return; |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1065 case PIX_FMT_BGR555: |
19173 | 1066 YSCALEYUV2PACKEDX_ACCURATE |
1067 YSCALEYUV2RGBX | |
1068 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1069 #ifdef DITHER1XBPP | |
1070 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1071 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1072 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1073 #endif | |
1074 | |
1075 WRITEBGR15(%4, %5, %%REGa) | |
1076 YSCALEYUV2PACKEDX_END | |
1077 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1078 case PIX_FMT_BGR565: |
19173 | 1079 YSCALEYUV2PACKEDX_ACCURATE |
1080 YSCALEYUV2RGBX | |
1081 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1082 #ifdef DITHER1XBPP | |
1083 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1084 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1085 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1086 #endif | |
1087 | |
1088 WRITEBGR16(%4, %5, %%REGa) | |
1089 YSCALEYUV2PACKEDX_END | |
1090 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1091 case PIX_FMT_YUYV422: |
19173 | 1092 YSCALEYUV2PACKEDX_ACCURATE |
1093 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1094 | |
1095 "psraw $3, %%mm3 \n\t" | |
1096 "psraw $3, %%mm4 \n\t" | |
1097 "psraw $3, %%mm1 \n\t" | |
1098 "psraw $3, %%mm7 \n\t" | |
1099 WRITEYUY2(%4, %5, %%REGa) | |
1100 YSCALEYUV2PACKEDX_END | |
1101 return; | |
1102 } | |
1103 }else{ | |
1104 switch(c->dstFormat) | |
1105 { | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1106 case PIX_FMT_RGB32: |
19173 | 1107 YSCALEYUV2PACKEDX |
1108 YSCALEYUV2RGBX | |
1109 WRITEBGR32(%4, %5, %%REGa) | |
1110 YSCALEYUV2PACKEDX_END | |
1111 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1112 case PIX_FMT_BGR24: |
19173 | 1113 YSCALEYUV2PACKEDX |
1114 YSCALEYUV2RGBX | |
19396 | 1115 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize |
1116 "add %4, %%"REG_c" \n\t" | |
1117 WRITEBGR24(%%REGc, %5, %%REGa) | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1118 |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1119 :: "r" (&c->redDither), |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1120 "m" (dummy), "m" (dummy), "m" (dummy), |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1121 "r" (dest), "m" (dstW) |
19396 | 1122 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1123 ); |
19173 | 1124 return; |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1125 case PIX_FMT_BGR555: |
19173 | 1126 YSCALEYUV2PACKEDX |
1127 YSCALEYUV2RGBX | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1128 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1129 #ifdef DITHER1XBPP |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1130 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1131 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1132 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1133 #endif |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1134 |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1135 WRITEBGR15(%4, %5, %%REGa) |
19173 | 1136 YSCALEYUV2PACKEDX_END |
1137 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1138 case PIX_FMT_BGR565: |
19173 | 1139 YSCALEYUV2PACKEDX |
1140 YSCALEYUV2RGBX | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1141 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1142 #ifdef DITHER1XBPP |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1143 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1144 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1145 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1146 #endif |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1147 |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1148 WRITEBGR16(%4, %5, %%REGa) |
19173 | 1149 YSCALEYUV2PACKEDX_END |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1150 return; |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1151 case PIX_FMT_YUYV422: |
18861 | 1152 YSCALEYUV2PACKEDX |
1153 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1154 | |
1155 "psraw $3, %%mm3 \n\t" | |
1156 "psraw $3, %%mm4 \n\t" | |
1157 "psraw $3, %%mm1 \n\t" | |
1158 "psraw $3, %%mm7 \n\t" | |
1159 WRITEYUY2(%4, %5, %%REGa) | |
19173 | 1160 YSCALEYUV2PACKEDX_END |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1161 return; |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1162 } |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
1163 } |
18861 | 1164 #endif
1165 #ifdef HAVE_ALTIVEC | |
1166 /* The following list of supported dstFormat values should | |
1167 match what's found in the body of altivec_yuv2packedX() */ | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1168 if(c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA || |
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1169 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 || |
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1170 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB) |
18861 | 1171 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1172 chrFilter, chrSrc, chrFilterSize, | |
1173 dest, dstW, dstY); | |
1174 else | |
1175 #endif | |
/* generic fallback; also reached by MMX builds for formats the switches above do not handle */
1176 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, | |
1177 chrFilter, chrSrc, chrFilterSize, | |
1178 dest, dstW, dstY); | |
1179 } | |
1180 | |
1181 /** | |
1182 * vertical bilinear scale YV12 to RGB | |
 *
 * Blends two luma lines (buf0, buf1) and two chroma lines (uvbuf0, uvbuf1)
 * and writes one line of packed output to dest.
 *
 * yalpha1 and uvalpha1 are derived as alpha ^ 4095 (the complementary weight
 * within a 12-bit range).
 *
 * The large "#if 0" region below is disabled code and is never compiled.
 *
 * With HAVE_MMX the function switches on c->dstFormat and handles
 * PIX_FMT_RGB32, PIX_FMT_BGR24, PIX_FMT_BGR555, PIX_FMT_BGR565 and
 * PIX_FMT_YUYV422 in inline asm, returning directly. Each asm block follows
 * the same protocol: REG_b is saved into the context at ESP_OFFSET(%5),
 * dest (%4) is loaded into REG_b, REG_BP is pushed and passed to the
 * conversion and write macros as the index register, and finally REG_BP and
 * REG_b are restored. Operand %5 is &c->redDither.
 *
 * All other formats, and non-MMX builds, use the C implementation expanded
 * from YSCALE_YUV_2_ANYRGB_C.
1183 */ | |
1184 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, | |
1185 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) | |
1186 { | |
1187 int yalpha1=yalpha^4095; | |
1188 int uvalpha1=uvalpha^4095; | |
1189 int i; | |
1190 | |
1191 #if 0 //isn't used | |
1192 if(flags&SWS_FULL_CHR_H_INT) | |
1193 { | |
1194 switch(dstFormat) | |
1195 { | |
1196 #ifdef HAVE_MMX | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1197 case PIX_FMT_RGB32: |
18861 | 1198 asm volatile(
1199 | |
1200 | |
1201 FULL_YSCALEYUV2RGB | |
1202 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
1203 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
1204 | |
1205 "movq %%mm3, %%mm1 \n\t" | |
1206 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
1207 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
1208 | |
1209 MOVNTQ(%%mm3, (%4, %%REGa, 4)) | |
1210 MOVNTQ(%%mm1, 8(%4, %%REGa, 4)) | |
1211 | |
1212 "add $4, %%"REG_a" \n\t" | |
1213 "cmp %5, %%"REG_a" \n\t" | |
1214 " jb 1b \n\t" | |
1215 | |
1216 | |
1217 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW), | |
1218 "m" (yalpha1), "m" (uvalpha1) | |
1219 : "%"REG_a | |
1220 ); | |
1221 break; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1222 case PIX_FMT_BGR24: |
18861 | 1223 asm volatile(
1224 | |
1225 FULL_YSCALEYUV2RGB | |
1226 | |
1227 // lsb ... msb | |
1228 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | |
1229 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | |
1230 | |
1231 "movq %%mm3, %%mm1 \n\t" | |
1232 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | |
1233 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | |
1234 | |
1235 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0 | |
1236 "psrlq $8, %%mm3 \n\t" // GR0BGR00 | |
1237 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000 | |
1238 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00 | |
1239 "por %%mm2, %%mm3 \n\t" // BGRBGR00 | |
1240 "movq %%mm1, %%mm2 \n\t" | |
1241 "psllq $48, %%mm1 \n\t" // 000000BG | |
1242 "por %%mm1, %%mm3 \n\t" // BGRBGRBG | |
1243 | |
1244 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0 | |
1245 "psrld $16, %%mm2 \n\t" // R000R000 | |
1246 "psrlq $24, %%mm1 \n\t" // 0BGR0000 | |
1247 "por %%mm2, %%mm1 \n\t" // RBGRR000 | |
1248 | |
1249 "mov %4, %%"REG_b" \n\t" | |
1250 "add %%"REG_a", %%"REG_b" \n\t" | |
1251 | |
1252 #ifdef HAVE_MMX2 | |
1253 //FIXME Alignment | |
1254 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t" | |
1255 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t" | |
1256 #else | |
1257 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t" | |
1258 "psrlq $32, %%mm3 \n\t" | |
1259 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t" | |
1260 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t" | |
1261 #endif | |
1262 "add $4, %%"REG_a" \n\t" | |
1263 "cmp %5, %%"REG_a" \n\t" | |
1264 " jb 1b \n\t" | |
1265 | |
1266 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), | |
1267 "m" (yalpha1), "m" (uvalpha1) | |
1268 : "%"REG_a, "%"REG_b | |
1269 ); | |
1270 break; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1271 case PIX_FMT_BGR555: |
18861 | 1272 asm volatile(
1273 | |
1274 FULL_YSCALEYUV2RGB | |
1275 #ifdef DITHER1XBPP | |
1276 "paddusb "MANGLE(g5Dither)", %%mm1\n\t" | |
1277 "paddusb "MANGLE(r5Dither)", %%mm0\n\t" | |
1278 "paddusb "MANGLE(b5Dither)", %%mm3\n\t" | |
1279 #endif | |
1280 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | |
1281 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
1282 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
1283 | |
1284 "psrlw $3, %%mm3 \n\t" | |
1285 "psllw $2, %%mm1 \n\t" | |
1286 "psllw $7, %%mm0 \n\t" | |
1287 "pand "MANGLE(g15Mask)", %%mm1 \n\t" | |
1288 "pand "MANGLE(r15Mask)", %%mm0 \n\t" | |
1289 | |
1290 "por %%mm3, %%mm1 \n\t" | |
1291 "por %%mm1, %%mm0 \n\t" | |
1292 | |
1293 MOVNTQ(%%mm0, (%4, %%REGa, 2)) | |
1294 | |
1295 "add $4, %%"REG_a" \n\t" | |
1296 "cmp %5, %%"REG_a" \n\t" | |
1297 " jb 1b \n\t" | |
1298 | |
1299 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), | |
1300 "m" (yalpha1), "m" (uvalpha1) | |
1301 : "%"REG_a | |
1302 ); | |
1303 break; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1304 case PIX_FMT_BGR565: |
18861 | 1305 asm volatile(
1306 | |
1307 FULL_YSCALEYUV2RGB | |
1308 #ifdef DITHER1XBPP | |
1309 "paddusb "MANGLE(g6Dither)", %%mm1\n\t" | |
1310 "paddusb "MANGLE(r5Dither)", %%mm0\n\t" | |
1311 "paddusb "MANGLE(b5Dither)", %%mm3\n\t" | |
1312 #endif | |
1313 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | |
1314 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |
1315 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |
1316 | |
1317 "psrlw $3, %%mm3 \n\t" | |
1318 "psllw $3, %%mm1 \n\t" | |
1319 "psllw $8, %%mm0 \n\t" | |
1320 "pand "MANGLE(g16Mask)", %%mm1 \n\t" | |
1321 "pand "MANGLE(r16Mask)", %%mm0 \n\t" | |
1322 | |
1323 "por %%mm3, %%mm1 \n\t" | |
1324 "por %%mm1, %%mm0 \n\t" | |
1325 | |
1326 MOVNTQ(%%mm0, (%4, %%REGa, 2)) | |
1327 | |
1328 "add $4, %%"REG_a" \n\t" | |
1329 "cmp %5, %%"REG_a" \n\t" | |
1330 " jb 1b \n\t" | |
1331 | |
1332 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), | |
1333 "m" (yalpha1), "m" (uvalpha1) | |
1334 : "%"REG_a | |
1335 ); | |
1336 break; | |
1337 #endif | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1338 case PIX_FMT_BGR32: |
18861 | 1339 #ifndef HAVE_MMX
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1340 case PIX_FMT_RGB32: |
18861 | 1341 #endif
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1342 if(dstFormat==PIX_FMT_RGB32) |
18861 | 1343 {
1344 int i; | |
1345 #ifdef WORDS_BIGENDIAN | |
1346 dest++; | |
1347 #endif | |
1348 for(i=0;i<dstW;i++){ | |
1349 // vertical linear interpolation && yuv2rgb in a single step: | |
1350 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1351 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1352 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1353 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; | |
1354 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1355 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
1356 dest+= 4; | |
1357 } | |
1358 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1359 else if(dstFormat==PIX_FMT_BGR24) |
18861 | 1360 {
1361 int i; | |
1362 for(i=0;i<dstW;i++){ | |
1363 // vertical linear interpolation && yuv2rgb in a single step: | |
1364 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1365 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1366 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1367 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; | |
1368 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1369 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
1370 dest+= 3; | |
1371 } | |
1372 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1373 else if(dstFormat==PIX_FMT_BGR565) |
18861 | 1374 {
1375 int i; | |
1376 for(i=0;i<dstW;i++){ | |
1377 // vertical linear interpolation && yuv2rgb in a single step: | |
1378 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1379 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1380 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1381 | |
1382 ((uint16_t*)dest)[i] = | |
1383 clip_table16b[(Y + yuvtab_40cf[U]) >>13] | | |
1384 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1385 clip_table16r[(Y + yuvtab_3343[V]) >>13]; | |
1386 } | |
1387 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1388 else if(dstFormat==PIX_FMT_BGR555) |
18861 | 1389 {
1390 int i; | |
1391 for(i=0;i<dstW;i++){ | |
1392 // vertical linear interpolation && yuv2rgb in a single step: | |
1393 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1394 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | |
1395 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); | |
1396 | |
1397 ((uint16_t*)dest)[i] = | |
1398 clip_table15b[(Y + yuvtab_40cf[U]) >>13] | | |
1399 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1400 clip_table15r[(Y + yuvtab_3343[V]) >>13]; | |
1401 } | |
1402 } | |
1403 }//FULL_UV_IPOL | |
1404 else | |
1405 { | |
1406 #endif // if 0 | |
1407 #ifdef HAVE_MMX | |
1408 switch(c->dstFormat) | |
1409 { | |
1410 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1411 case PIX_FMT_RGB32: |
18861 | 1412 asm volatile(
1413 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1414 "mov %4, %%"REG_b" \n\t" | |
1415 "push %%"REG_BP" \n\t" | |
1416 YSCALEYUV2RGB(%%REGBP, %5) | |
1417 WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1418 "pop %%"REG_BP" \n\t" | |
1419 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1420 | |
1421 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1422 "a" (&c->redDither) | |
1423 ); | |
1424 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1425 case PIX_FMT_BGR24: |
18861 | 1426 asm volatile(
1427 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1428 "mov %4, %%"REG_b" \n\t" | |
1429 "push %%"REG_BP" \n\t" | |
1430 YSCALEYUV2RGB(%%REGBP, %5) | |
1431 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1432 "pop %%"REG_BP" \n\t" | |
1433 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1434 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1435 "a" (&c->redDither) | |
1436 ); | |
1437 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1438 case PIX_FMT_BGR555: |
18861 | 1439 asm volatile(
1440 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1441 "mov %4, %%"REG_b" \n\t" | |
1442 "push %%"REG_BP" \n\t" | |
1443 YSCALEYUV2RGB(%%REGBP, %5) | |
1444 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1445 #ifdef DITHER1XBPP | |
1446 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1447 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1448 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1449 #endif | |
1450 | |
1451 WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | |
1452 "pop %%"REG_BP" \n\t" | |
1453 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1454 | |
1455 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1456 "a" (&c->redDither) | |
1457 ); | |
1458 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1459 case PIX_FMT_BGR565: |
18861 | 1460 asm volatile(
1461 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1462 "mov %4, %%"REG_b" \n\t" | |
1463 "push %%"REG_BP" \n\t" | |
1464 YSCALEYUV2RGB(%%REGBP, %5) | |
1465 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1466 #ifdef DITHER1XBPP | |
1467 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1468 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1469 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1470 #endif | |
1471 | |
1472 WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | |
1473 "pop %%"REG_BP" \n\t" | |
1474 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1475 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1476 "a" (&c->redDither) | |
1477 ); | |
1478 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1479 case PIX_FMT_YUYV422: |
18861 | 1480 asm volatile(
1481 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1482 "mov %4, %%"REG_b" \n\t" | |
1483 "push %%"REG_BP" \n\t" | |
1484 YSCALEYUV2PACKED(%%REGBP, %5) | |
1485 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1486 "pop %%"REG_BP" \n\t" | |
1487 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1488 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1489 "a" (&c->redDither) | |
1490 ); | |
1491 return; | |
1492 default: break; | |
1493 } | |
1494 #endif //HAVE_MMX | |
/* C fallback for non-MMX builds and for formats not handled by the switch above */
1495 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C) | |
1496 } | |
1497 | |
1498 /** | |
1499 * YV12 to RGB without scaling or interpolating | |
1500 */ | |
1501 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, | |
1502 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y) | |
1503 { | |
1504 const int yalpha1=0; | |
1505 int i; | |
1506 | |
1507 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1 | |
1508 const int yalpha= 4096; //FIXME ... | |
1509 | |
1510 if(flags&SWS_FULL_CHR_H_INT) | |
1511 { | |
1512 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y); | |
1513 return; | |
1514 } | |
1515 | |
1516 #ifdef HAVE_MMX | |
1517 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster | |
1518 { | |
1519 switch(dstFormat) | |
1520 { | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1521 case PIX_FMT_RGB32: |
18861 | 1522 asm volatile( |
1523 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1524 "mov %4, %%"REG_b" \n\t" | |
1525 "push %%"REG_BP" \n\t" | |
1526 YSCALEYUV2RGB1(%%REGBP, %5) | |
1527 WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1528 "pop %%"REG_BP" \n\t" | |
1529 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1530 | |
1531 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1532 "a" (&c->redDither) | |
1533 ); | |
1534 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1535 case PIX_FMT_BGR24: |
18861 | 1536 asm volatile( |
1537 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1538 "mov %4, %%"REG_b" \n\t" | |
1539 "push %%"REG_BP" \n\t" | |
1540 YSCALEYUV2RGB1(%%REGBP, %5) | |
1541 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1542 "pop %%"REG_BP" \n\t" | |
1543 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1544 | |
1545 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1546 "a" (&c->redDither) | |
1547 ); | |
1548 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1549 case PIX_FMT_BGR555: |
18861 | 1550 asm volatile( |
1551 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1552 "mov %4, %%"REG_b" \n\t" | |
1553 "push %%"REG_BP" \n\t" | |
1554 YSCALEYUV2RGB1(%%REGBP, %5) | |
1555 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1556 #ifdef DITHER1XBPP | |
1557 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1558 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1559 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1560 #endif | |
1561 WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | |
1562 "pop %%"REG_BP" \n\t" | |
1563 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1564 | |
1565 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1566 "a" (&c->redDither) | |
1567 ); | |
1568 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1569 case PIX_FMT_BGR565: |
18861 | 1570 asm volatile( |
1571 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1572 "mov %4, %%"REG_b" \n\t" | |
1573 "push %%"REG_BP" \n\t" | |
1574 YSCALEYUV2RGB1(%%REGBP, %5) | |
1575 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1576 #ifdef DITHER1XBPP | |
1577 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1578 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1579 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1580 #endif | |
1581 | |
1582 WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | |
1583 "pop %%"REG_BP" \n\t" | |
1584 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1585 | |
1586 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1587 "a" (&c->redDither) | |
1588 ); | |
1589 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1590 case PIX_FMT_YUYV422: |
18861 | 1591 asm volatile( |
1592 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1593 "mov %4, %%"REG_b" \n\t" | |
1594 "push %%"REG_BP" \n\t" | |
1595 YSCALEYUV2PACKED1(%%REGBP, %5) | |
1596 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1597 "pop %%"REG_BP" \n\t" | |
1598 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1599 | |
1600 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1601 "a" (&c->redDither) | |
1602 ); | |
1603 return; | |
1604 } | |
1605 } | |
1606 else | |
1607 { | |
1608 switch(dstFormat) | |
1609 { | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1610 case PIX_FMT_RGB32: |
18861 | 1611 asm volatile( |
1612 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1613 "mov %4, %%"REG_b" \n\t" | |
1614 "push %%"REG_BP" \n\t" | |
1615 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1616 WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | |
1617 "pop %%"REG_BP" \n\t" | |
1618 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1619 | |
1620 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1621 "a" (&c->redDither) | |
1622 ); | |
1623 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1624 case PIX_FMT_BGR24: |
18861 | 1625 asm volatile( |
1626 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1627 "mov %4, %%"REG_b" \n\t" | |
1628 "push %%"REG_BP" \n\t" | |
1629 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1630 WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | |
1631 "pop %%"REG_BP" \n\t" | |
1632 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1633 | |
1634 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1635 "a" (&c->redDither) | |
1636 ); | |
1637 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1638 case PIX_FMT_BGR555: |
18861 | 1639 asm volatile( |
1640 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1641 "mov %4, %%"REG_b" \n\t" | |
1642 "push %%"REG_BP" \n\t" | |
1643 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1644 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1645 #ifdef DITHER1XBPP | |
1646 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1647 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1648 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1649 #endif | |
1650 WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | |
1651 "pop %%"REG_BP" \n\t" | |
1652 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1653 | |
1654 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1655 "a" (&c->redDither) | |
1656 ); | |
1657 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1658 case PIX_FMT_BGR565: |
18861 | 1659 asm volatile( |
1660 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1661 "mov %4, %%"REG_b" \n\t" | |
1662 "push %%"REG_BP" \n\t" | |
1663 YSCALEYUV2RGB1b(%%REGBP, %5) | |
1664 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
1665 #ifdef DITHER1XBPP | |
1666 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | |
1667 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1668 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1669 #endif | |
1670 | |
1671 WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | |
1672 "pop %%"REG_BP" \n\t" | |
1673 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1674 | |
1675 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1676 "a" (&c->redDither) | |
1677 ); | |
1678 return; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
1679 case PIX_FMT_YUYV422: |
18861 | 1680 asm volatile( |
1681 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |
1682 "mov %4, %%"REG_b" \n\t" | |
1683 "push %%"REG_BP" \n\t" | |
1684 YSCALEYUV2PACKED1b(%%REGBP, %5) | |
1685 WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | |
1686 "pop %%"REG_BP" \n\t" | |
1687 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |
1688 | |
1689 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | |
1690 "a" (&c->redDither) | |
1691 ); | |
1692 return; | |
1693 } | |
1694 } | |
1695 #endif | |
1696 if( uvalpha < 2048 ) | |
1697 { | |
1698 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C) | |
1699 }else{ | |
1700 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C) | |
1701 } | |
1702 } | |
1703 | |
//FIXME yuy2* can read up to 7 samples too many

/**
 * Extracts every even byte of a packed line: dst[i] = src[2*i].
 *
 * @param dst   output buffer, one byte per sample
 * @param src   packed input, two bytes per output sample
 * @param width number of samples to produce
 *
 * The MMX loop produces 8 samples per iteration and only stops once the
 * (negative) index has counted up to zero or beyond, so for a width that is
 * not a multiple of 8 the last iteration reads and writes past the nominal
 * end of both buffers.
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm2\n\t" // byte mask; presumably 0x00FF in every word (defined elsewhere)
		"mov %0, %%"REG_a" \n\t"               // index = -width, counts up towards 0
		"1: \n\t"
		"movq (%1, %%"REG_a",2), %%mm0 \n\t"   // 16 source bytes, addressed from the END of src
		"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
		"pand %%mm2, %%mm0 \n\t"
		"pand %%mm2, %%mm1 \n\t"
		"packuswb %%mm1, %%mm0 \n\t"           // 8 words -> 8 bytes
		"movq %%mm0, (%2, %%"REG_a") \n\t"     // 8 output bytes, addressed from the END of dst
		"add $8, %%"REG_a" \n\t"
		" js 1b \n\t"                          // loop while the index is still negative
		: : "g" (-width), "r" (src+width*2), "r" (dst+width)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i];
#endif
}
1730 | |
/**
 * Splits the odd bytes of a packed line into two planes:
 * dstU[i] = src1[4*i + 1], dstV[i] = src1[4*i + 3].
 *
 * @param dstU  first output plane, one byte per sample
 * @param dstV  second output plane, one byte per sample
 * @param src1  packed input, four bytes per output sample pair
 * @param src2  only used by the trailing assert; must equal src1
 * @param width number of samples to write to each plane
 *
 * The MMX loop produces 4 samples per plane per iteration, so a width that
 * is not a multiple of 4 makes it run past the nominal end of the buffers.
 * Note that the src1 == src2 assertion is evaluated after the conversion.
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t" // byte mask; presumably 0x00FF in every word (defined elsewhere)
		"mov %0, %%"REG_a" \n\t"               // index = -width, counts up towards 0
		"1: \n\t"
		"movq (%1, %%"REG_a",4), %%mm0 \n\t"   // 16 source bytes
		"movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
		"psrlw $8, %%mm0 \n\t"                 // keep the odd byte of every word
		"psrlw $8, %%mm1 \n\t"
		"packuswb %%mm1, %%mm0 \n\t"           // source bytes 1,3,5,...,15
		"movq %%mm0, %%mm1 \n\t"
		"psrlw $8, %%mm0 \n\t"                 // mm0: source bytes 3,7,11,15 (4*i + 3)
		"pand %%mm4, %%mm1 \n\t"               // mm1: source bytes 1,5,9,13 (4*i + 1), given the assumed mask
		"packuswb %%mm0, %%mm0 \n\t"
		"packuswb %%mm1, %%mm1 \n\t"
		"movd %%mm0, (%3, %%"REG_a") \n\t"     // operand 3 is dstV+width
		"movd %%mm1, (%2, %%"REG_a") \n\t"     // operand 2 is dstU+width
		"add $4, %%"REG_a" \n\t"
		" js 1b \n\t"
		: : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= src1[4*i + 1];
		dstV[i]= src1[4*i + 3];
	}
#endif
	assert(src1 == src2);
}
1765 | |
//this is almost identical to the previous, and exists only because yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses
/**
 * Extracts every odd byte of a packed line: dst[i] = src[2*i+1].
 *
 * @param dst   output buffer, one byte per sample
 * @param src   packed input, two bytes per output sample
 * @param width number of samples to produce
 *
 * The MMX loop produces 8 samples per iteration, so a width that is not a
 * multiple of 8 makes it read and write past the nominal end of the buffers.
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
	asm volatile(
		"mov %0, %%"REG_a" \n\t"               // index = -width, counts up towards 0
		"1: \n\t"
		"movq (%1, %%"REG_a",2), %%mm0 \n\t"   // 16 source bytes, addressed from the END of src
		"movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
		"psrlw $8, %%mm0 \n\t"                 // keep the odd byte of every word
		"psrlw $8, %%mm1 \n\t"
		"packuswb %%mm1, %%mm0 \n\t"           // 8 words -> 8 bytes
		"movq %%mm0, (%2, %%"REG_a") \n\t"
		"add $8, %%"REG_a" \n\t"
		" js 1b \n\t"                          // loop while the index is still negative
		: : "g" (-width), "r" (src+width*2), "r" (dst+width)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i+1];
#endif
}
1790 | |
/**
 * Splits the even bytes of a packed line into two planes:
 * dstU[i] = src1[4*i + 0], dstV[i] = src1[4*i + 2].
 *
 * @param dstU  first output plane, one byte per sample
 * @param dstV  second output plane, one byte per sample
 * @param src1  packed input, four bytes per output sample pair
 * @param src2  only used by the trailing assert; must equal src1
 * @param width number of samples to write to each plane
 *
 * The MMX loop produces 4 samples per plane per iteration, so a width that
 * is not a multiple of 4 makes it run past the nominal end of the buffers.
 * Note that the src1 == src2 assertion is evaluated after the conversion.
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t" // byte mask; presumably 0x00FF in every word (defined elsewhere)
		"mov %0, %%"REG_a" \n\t"               // index = -width, counts up towards 0
		"1: \n\t"
		"movq (%1, %%"REG_a",4), %%mm0 \n\t"   // 16 source bytes
		"movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
		"pand %%mm4, %%mm0 \n\t"               // keep the even byte of every word, given the assumed mask
		"pand %%mm4, %%mm1 \n\t"
		"packuswb %%mm1, %%mm0 \n\t"           // source bytes 0,2,4,...,14
		"movq %%mm0, %%mm1 \n\t"
		"psrlw $8, %%mm0 \n\t"                 // mm0: source bytes 2,6,10,14 (4*i + 2)
		"pand %%mm4, %%mm1 \n\t"               // mm1: source bytes 0,4,8,12 (4*i + 0)
		"packuswb %%mm0, %%mm0 \n\t"
		"packuswb %%mm1, %%mm1 \n\t"
		"movd %%mm0, (%3, %%"REG_a") \n\t"     // operand 3 is dstV+width
		"movd %%mm1, (%2, %%"REG_a") \n\t"     // operand 2 is dstU+width
		"add $4, %%"REG_a" \n\t"
		" js 1b \n\t"
		: : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= src1[4*i + 0];
		dstV[i]= src1[4*i + 2];
	}
#endif
	assert(src1 == src2);
}
1825 | |
1826 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width) | |
1827 { | |
1828 int i; | |
1829 for(i=0; i<width; i++) | |
1830 { | |
1831 int b= ((uint32_t*)src)[i]&0xFF; | |
1832 int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
1833 int r= (((uint32_t*)src)[i]>>16)&0xFF; | |
1834 | |
1835 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); | |
1836 } | |
1837 } | |
1838 | |
1839 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1840 { | |
1841 int i; | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
1842 assert(src1 == src2); |
18861 | 1843 for(i=0; i<width; i++) |
1844 { | |
1845 const int a= ((uint32_t*)src1)[2*i+0]; | |
1846 const int e= ((uint32_t*)src1)[2*i+1]; | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
1847 const int l= (a&0xFF00FF) + (e&0xFF00FF); |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
1848 const int h= (a&0x00FF00) + (e&0x00FF00); |
18861 | 1849 const int b= l&0x3FF; |
1850 const int g= h>>8; | |
1851 const int r= l>>16; | |
1852 | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
1853 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128; |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
1854 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128; |
18861 | 1855 } |
1856 } | |
1857 | |
/**
 * Computes one luma byte per packed 24-bit pixel (bytes in B, G, R order).
 *
 * @param dst   output buffer, width bytes
 * @param src   input, 3 bytes per pixel
 * @param width number of pixels
 *
 * The MMX loop converts 8 pixels (24 source bytes) per iteration and runs
 * until the negative index reaches zero or beyond, so a width that is not a
 * multiple of 8 makes it run past the nominal end of the buffers. Every
 * pixel is fetched with a 4-byte movd, i.e. together with the first byte of
 * the following pixel; the load at offset 21 therefore touches one byte
 * beyond the current 24-byte group.
 */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
	asm volatile(
		"mov %2, %%"REG_a" \n\t"                            // output index = -width
		"movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"             // per-channel weights (defined elsewhere)
		"movq "MANGLE(w1111)", %%mm5 \n\t"                  // presumably four words of 1, used to add word pairs
		"pxor %%mm7, %%mm7 \n\t"                            // mm7 = 0 for byte -> word unpacking
		"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"      // source index = 3 * output index
		ASMALIGN(4)
		"1: \n\t"
		PREFETCH" 64(%0, %%"REG_d") \n\t"
		/* pixels 0..3 */
		"movd (%0, %%"REG_d"), %%mm0 \n\t"
		"movd 3(%0, %%"REG_d"), %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"movd 6(%0, %%"REG_d"), %%mm2 \n\t"
		"movd 9(%0, %%"REG_d"), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"pmaddwd %%mm6, %%mm0 \n\t"                         // two partial sums per pixel
		"pmaddwd %%mm6, %%mm1 \n\t"
		"pmaddwd %%mm6, %%mm2 \n\t"
		"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0 \n\t"
		"psrad $8, %%mm1 \n\t"
		"psrad $8, %%mm2 \n\t"
		"psrad $8, %%mm3 \n\t"
#endif
		"packssdw %%mm1, %%mm0 \n\t"
		"packssdw %%mm3, %%mm2 \n\t"
		"pmaddwd %%mm5, %%mm0 \n\t"                         // combine the partial sums of each pixel
		"pmaddwd %%mm5, %%mm2 \n\t"
		"packssdw %%mm2, %%mm0 \n\t"                        // mm0: four 16-bit results
		"psraw $7, %%mm0 \n\t"

		/* pixels 4..7, same sequence with mm4 as the accumulator */
		"movd 12(%0, %%"REG_d"), %%mm4 \n\t"
		"movd 15(%0, %%"REG_d"), %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm4 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"movd 18(%0, %%"REG_d"), %%mm2 \n\t"
		"movd 21(%0, %%"REG_d"), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"pmaddwd %%mm6, %%mm4 \n\t"
		"pmaddwd %%mm6, %%mm1 \n\t"
		"pmaddwd %%mm6, %%mm2 \n\t"
		"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4 \n\t"
		"psrad $8, %%mm1 \n\t"
		"psrad $8, %%mm2 \n\t"
		"psrad $8, %%mm3 \n\t"
#endif
		"packssdw %%mm1, %%mm4 \n\t"
		"packssdw %%mm3, %%mm2 \n\t"
		"pmaddwd %%mm5, %%mm4 \n\t"
		"pmaddwd %%mm5, %%mm2 \n\t"
		"add $24, %%"REG_d" \n\t"                           // advance source index by 8 pixels
		"packssdw %%mm2, %%mm4 \n\t"
		"psraw $7, %%mm4 \n\t"

		"packuswb %%mm4, %%mm0 \n\t"                        // 8 results -> 8 bytes
		"paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"         // constant offset (defined elsewhere)

		"movq %%mm0, (%1, %%"REG_a") \n\t"
		"add $8, %%"REG_a" \n\t"
		" js 1b \n\t"
		: : "r" (src+width*3), "r" (dst+width), "g" (-width)
		: "%"REG_a, "%"REG_d
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src[i*3+0];
		int g= src[i*3+1];
		int r= src[i*3+2];

		/* 33<<(SHIFT-1) is 16.5 in fixed point: the +16 offset plus rounding */
		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
	}
#endif
}
1942 | |
1943 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) | |
1944 { | |
1945 #ifdef HAVE_MMX | |
1946 asm volatile( | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
1947 "mov %3, %%"REG_a" \n\t" |
18861 | 1948 "movq "MANGLE(w1111)", %%mm5 \n\t" |
1949 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | |
1950 "pxor %%mm7, %%mm7 \n\t" | |
19396 | 1951 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" |
1952 "add %%"REG_d", %%"REG_d" \n\t" | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
1953 ASMALIGN(4) |
18861 | 1954 "1: \n\t" |
19396 | 1955 PREFETCH" 64(%0, %%"REG_d") \n\t" |
18861 | 1956 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
19396 | 1957 "movq (%0, %%"REG_d"), %%mm0 \n\t" |
1958 "movq 6(%0, %%"REG_d"), %%mm2 \n\t" | |
18861 | 1959 "movq %%mm0, %%mm1 \n\t" |
1960 "movq %%mm2, %%mm3 \n\t" | |
1961 "psrlq $24, %%mm0 \n\t" | |
1962 "psrlq $24, %%mm2 \n\t" | |
1963 PAVGB(%%mm1, %%mm0) | |
1964 PAVGB(%%mm3, %%mm2) | |
1965 "punpcklbw %%mm7, %%mm0 \n\t" | |
1966 "punpcklbw %%mm7, %%mm2 \n\t" | |
1967 #else | |
19396 | 1968 "movd (%0, %%"REG_d"), %%mm0 \n\t" |
1969 "movd 3(%0, %%"REG_d"), %%mm2 \n\t" | |
18861 | 1970 "punpcklbw %%mm7, %%mm0 \n\t" |
1971 "punpcklbw %%mm7, %%mm2 \n\t" | |
1972 "paddw %%mm2, %%mm0 \n\t" | |
19396 | 1973 "movd 6(%0, %%"REG_d"), %%mm4 \n\t" |
1974 "movd 9(%0, %%"REG_d"), %%mm2 \n\t" | |
18861 | 1975 "punpcklbw %%mm7, %%mm4 \n\t" |
1976 "punpcklbw %%mm7, %%mm2 \n\t" | |
1977 "paddw %%mm4, %%mm2 \n\t" | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
1978 "psrlw $1, %%mm0 \n\t" |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
1979 "psrlw $1, %%mm2 \n\t" |
18861 | 1980 #endif |
1981 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" | |
1982 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
1983 | |
1984 "pmaddwd %%mm0, %%mm1 \n\t" | |
1985 "pmaddwd %%mm2, %%mm3 \n\t" | |
1986 "pmaddwd %%mm6, %%mm0 \n\t" | |
1987 "pmaddwd %%mm6, %%mm2 \n\t" | |
1988 #ifndef FAST_BGR2YV12 | |
1989 "psrad $8, %%mm0 \n\t" | |
1990 "psrad $8, %%mm1 \n\t" | |
1991 "psrad $8, %%mm2 \n\t" | |
1992 "psrad $8, %%mm3 \n\t" | |
1993 #endif | |
1994 "packssdw %%mm2, %%mm0 \n\t" | |
1995 "packssdw %%mm3, %%mm1 \n\t" | |
1996 "pmaddwd %%mm5, %%mm0 \n\t" | |
1997 "pmaddwd %%mm5, %%mm1 \n\t" | |
1998 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
1999 "psraw $7, %%mm0 \n\t" | |
2000 | |
2001 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
19396 | 2002 "movq 12(%0, %%"REG_d"), %%mm4 \n\t" |
2003 "movq 18(%0, %%"REG_d"), %%mm2 \n\t" | |
18861 | 2004 "movq %%mm4, %%mm1 \n\t" |
2005 "movq %%mm2, %%mm3 \n\t" | |
2006 "psrlq $24, %%mm4 \n\t" | |
2007 "psrlq $24, %%mm2 \n\t" | |
2008 PAVGB(%%mm1, %%mm4) | |
2009 PAVGB(%%mm3, %%mm2) | |
2010 "punpcklbw %%mm7, %%mm4 \n\t" | |
2011 "punpcklbw %%mm7, %%mm2 \n\t" | |
2012 #else | |
19396 | 2013 "movd 12(%0, %%"REG_d"), %%mm4 \n\t" |
2014 "movd 15(%0, %%"REG_d"), %%mm2 \n\t" | |
18861 | 2015 "punpcklbw %%mm7, %%mm4 \n\t" |
2016 "punpcklbw %%mm7, %%mm2 \n\t" | |
2017 "paddw %%mm2, %%mm4 \n\t" | |
19396 | 2018 "movd 18(%0, %%"REG_d"), %%mm5 \n\t" |
2019 "movd 21(%0, %%"REG_d"), %%mm2 \n\t" | |
18861 | 2020 "punpcklbw %%mm7, %%mm5 \n\t" |
2021 "punpcklbw %%mm7, %%mm2 \n\t" | |
2022 "paddw %%mm5, %%mm2 \n\t" | |
2023 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
2024 "psrlw $2, %%mm4 \n\t" | |
2025 "psrlw $2, %%mm2 \n\t" | |
2026 #endif | |
2027 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" | |
2028 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
2029 | |
2030 "pmaddwd %%mm4, %%mm1 \n\t" | |
2031 "pmaddwd %%mm2, %%mm3 \n\t" | |
2032 "pmaddwd %%mm6, %%mm4 \n\t" | |
2033 "pmaddwd %%mm6, %%mm2 \n\t" | |
2034 #ifndef FAST_BGR2YV12 | |
2035 "psrad $8, %%mm4 \n\t" | |
2036 "psrad $8, %%mm1 \n\t" | |
2037 "psrad $8, %%mm2 \n\t" | |
2038 "psrad $8, %%mm3 \n\t" | |
2039 #endif | |
2040 "packssdw %%mm2, %%mm4 \n\t" | |
2041 "packssdw %%mm3, %%mm1 \n\t" | |
2042 "pmaddwd %%mm5, %%mm4 \n\t" | |
2043 "pmaddwd %%mm5, %%mm1 \n\t" | |
19396 | 2044 "add $24, %%"REG_d" \n\t" |
18861 | 2045 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 |
2046 "psraw $7, %%mm4 \n\t" | |
2047 | |
2048 "movq %%mm0, %%mm1 \n\t" | |
2049 "punpckldq %%mm4, %%mm0 \n\t" | |
2050 "punpckhdq %%mm4, %%mm1 \n\t" | |
2051 "packsswb %%mm1, %%mm0 \n\t" | |
2052 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" | |
2053 | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2054 "movd %%mm0, (%1, %%"REG_a") \n\t" |
18861 | 2055 "punpckhdq %%mm0, %%mm0 \n\t" |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2056 "movd %%mm0, (%2, %%"REG_a") \n\t" |
18861 | 2057 "add $4, %%"REG_a" \n\t" |
2058 " js 1b \n\t" | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2059 : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width) |
19396 | 2060 : "%"REG_a, "%"REG_d |
18861 | 2061 ); |
2062 #else | |
2063 int i; | |
2064 for(i=0; i<width; i++) | |
2065 { | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2066 int b= src1[6*i + 0] + src1[6*i + 3]; |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2067 int g= src1[6*i + 1] + src1[6*i + 4]; |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2068 int r= src1[6*i + 2] + src1[6*i + 5]; |
18861 | 2069 |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2070 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128; |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2071 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128; |
18861 | 2072 } |
2073 #endif | |
21686 | 2074 assert(src1 == src2); |
18861 | 2075 } |
2076 | |
2077 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width) | |
2078 { | |
2079 int i; | |
2080 for(i=0; i<width; i++) | |
2081 { | |
2082 int d= ((uint16_t*)src)[i]; | |
2083 int b= d&0x1F; | |
2084 int g= (d>>5)&0x3F; | |
2085 int r= (d>>11)&0x1F; | |
2086 | |
2087 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16; | |
2088 } | |
2089 } | |
2090 | |
2091 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2092 { | |
20946
bb4c952bc52c
forgotten 2 converters (yeah svn up, svn di svn ci isnt enough i should actually look at the code after svn up not just the diff ...)
michael
parents:
20945
diff
changeset
|
2093 int i; |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2094 assert(src1==src2); |
18861 | 2095 for(i=0; i<width; i++) |
2096 { | |
2097 int d0= ((uint32_t*)src1)[i]; | |
2098 | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2099 int dl= (d0&0x07E0F81F); |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2100 int dh= ((d0>>5)&0x07C0F83F); |
18861 | 2101 |
2102 int dh2= (dh>>11) + (dh<<21); | |
2103 int d= dh2 + dl; | |
2104 | |
2105 int b= d&0x7F; | |
2106 int r= (d>>11)&0x7F; | |
2107 int g= d>>21; | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2108 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128; |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2109 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128; |
18861 | 2110 } |
2111 } | |
2112 | |
2113 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width) | |
2114 { | |
2115 int i; | |
2116 for(i=0; i<width; i++) | |
2117 { | |
2118 int d= ((uint16_t*)src)[i]; | |
2119 int b= d&0x1F; | |
2120 int g= (d>>5)&0x1F; | |
2121 int r= (d>>10)&0x1F; | |
2122 | |
2123 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16; | |
2124 } | |
2125 } | |
2126 | |
2127 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2128 { | |
2129 int i; | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2130 assert(src1==src2); |
18861 | 2131 for(i=0; i<width; i++) |
2132 { | |
2133 int d0= ((uint32_t*)src1)[i]; | |
2134 | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2135 int dl= (d0&0x03E07C1F); |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2136 int dh= ((d0>>5)&0x03E0F81F); |
18861 | 2137 |
2138 int dh2= (dh>>11) + (dh<<21); | |
2139 int d= dh2 + dl; | |
2140 | |
2141 int b= d&0x7F; | |
2142 int r= (d>>10)&0x7F; | |
2143 int g= d>>21; | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2144 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128; |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2145 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128; |
18861 | 2146 } |
2147 } | |
2148 | |
2149 | |
2150 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width) | |
2151 { | |
2152 int i; | |
2153 for(i=0; i<width; i++) | |
2154 { | |
2155 int r= ((uint32_t*)src)[i]&0xFF; | |
2156 int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
2157 int b= (((uint32_t*)src)[i]>>16)&0xFF; | |
2158 | |
2159 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); | |
2160 } | |
2161 } | |
2162 | |
2163 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2164 { | |
2165 int i; | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2166 assert(src1==src2); |
18861 | 2167 for(i=0; i<width; i++) |
2168 { | |
2169 const int a= ((uint32_t*)src1)[2*i+0]; | |
2170 const int e= ((uint32_t*)src1)[2*i+1]; | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2171 const int l= (a&0xFF00FF) + (e&0xFF00FF); |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2172 const int h= (a&0x00FF00) + (e&0x00FF00); |
18861 | 2173 const int r= l&0x3FF; |
2174 const int g= h>>8; | |
2175 const int b= l>>16; | |
2176 | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2177 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128; |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2178 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128; |
18861 | 2179 } |
2180 } | |
2181 | |
2182 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width) | |
2183 { | |
2184 int i; | |
2185 for(i=0; i<width; i++) | |
2186 { | |
2187 int r= src[i*3+0]; | |
2188 int g= src[i*3+1]; | |
2189 int b= src[i*3+2]; | |
2190 | |
2191 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); | |
2192 } | |
2193 } | |
2194 | |
2195 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2196 { | |
2197 int i; | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2198 assert(src1==src2); |
18861 | 2199 for(i=0; i<width; i++) |
2200 { | |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2201 int r= src1[6*i + 0] + src1[6*i + 3]; |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2202 int g= src1[6*i + 1] + src1[6*i + 4]; |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2203 int b= src1[6*i + 2] + src1[6*i + 5]; |
18861 | 2204 |
20945
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2205 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128; |
92150c16e737
fixing the lamest bug in swscale, all the rgb/bgr->* code did 2x2 downsampling for chroma, it should just be 2x1 (the rest of the code also belived its 2x1 ...)
michael
parents:
20724
diff
changeset
|
2206 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128; |
18861 | 2207 } |
2208 } | |
2209 | |
20589
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2210 static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width) |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2211 { |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2212 int i; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2213 for(i=0; i<width; i++) |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2214 { |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2215 int d= ((uint16_t*)src)[i]; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2216 int r= d&0x1F; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2217 int g= (d>>5)&0x3F; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2218 int b= (d>>11)&0x1F; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2219 |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2220 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2221 } |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2222 } |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2223 |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2224 static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2225 { |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2226 int i; |
20946
bb4c952bc52c
forgotten 2 converters (yeah svn up, svn di svn ci isnt enough i should actually look at the code after svn up not just the diff ...)
michael
parents:
20945
diff
changeset
|
2227 assert(src1 == src2); |
20589
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2228 for(i=0; i<width; i++) |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2229 { |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2230 int d0= ((uint32_t*)src1)[i]; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2231 |
20946
bb4c952bc52c
forgotten 2 converters (yeah svn up, svn di svn ci isnt enough i should actually look at the code after svn up not just the diff ...)
michael
parents:
20945
diff
changeset
|
2232 int dl= (d0&0x07E0F81F); |
bb4c952bc52c
forgotten 2 converters (yeah svn up, svn di svn ci isnt enough i should actually look at the code after svn up not just the diff ...)
michael
parents:
20945
diff
changeset
|
2233 int dh= ((d0>>5)&0x07C0F83F); |
20589
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2234 |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2235 int dh2= (dh>>11) + (dh<<21); |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2236 int d= dh2 + dl; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2237 |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2238 int r= d&0x7F; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2239 int b= (d>>11)&0x7F; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2240 int g= d>>21; |
20946
bb4c952bc52c
forgotten 2 converters (yeah svn up, svn di svn ci isnt enough i should actually look at the code after svn up not just the diff ...)
michael
parents:
20945
diff
changeset
|
2241 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128; |
bb4c952bc52c
forgotten 2 converters (yeah svn up, svn di svn ci isnt enough i should actually look at the code after svn up not just the diff ...)
michael
parents:
20945
diff
changeset
|
2242 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128; |
20589
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2243 } |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2244 } |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2245 |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2246 static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width) |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2247 { |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2248 int i; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2249 for(i=0; i<width; i++) |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2250 { |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2251 int d= ((uint16_t*)src)[i]; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2252 int r= d&0x1F; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2253 int g= (d>>5)&0x1F; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2254 int b= (d>>10)&0x1F; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2255 |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2256 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2257 } |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2258 } |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2259 |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2260 static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2261 { |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2262 int i; |
20946
bb4c952bc52c
forgotten 2 converters (yeah svn up, svn di svn ci isnt enough i should actually look at the code after svn up not just the diff ...)
michael
parents:
20945
diff
changeset
|
2263 assert(src1 == src2); |
20589
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2264 for(i=0; i<width; i++) |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2265 { |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2266 int d0= ((uint32_t*)src1)[i]; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2267 |
20946
bb4c952bc52c
forgotten 2 converters (yeah svn up, svn di svn ci isnt enough i should actually look at the code after svn up not just the diff ...)
michael
parents:
20945
diff
changeset
|
2268 int dl= (d0&0x03E07C1F); |
bb4c952bc52c
forgotten 2 converters (yeah svn up, svn di svn ci isnt enough i should actually look at the code after svn up not just the diff ...)
michael
parents:
20945
diff
changeset
|
2269 int dh= ((d0>>5)&0x03E0F81F); |
20589
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2270 |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2271 int dh2= (dh>>11) + (dh<<21); |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2272 int d= dh2 + dl; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2273 |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2274 int g= d&0x7F; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2275 int r= (d>>10)&0x7F; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2276 int b= d>>21; |
20946
bb4c952bc52c
forgotten 2 converters (yeah svn up, svn di svn ci isnt enough i should actually look at the code after svn up not just the diff ...)
michael
parents:
20945
diff
changeset
|
2277 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128; |
bb4c952bc52c
forgotten 2 converters (yeah svn up, svn di svn ci isnt enough i should actually look at the code after svn up not just the diff ...)
michael
parents:
20945
diff
changeset
|
2278 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128; |
20589
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2279 } |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2280 } |
18861 | 2281 |
22218 | 2282 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal) |
2283 { | |
2284 int i; | |
2285 for(i=0; i<width; i++) | |
2286 { | |
2287 int d= src[i]; | |
2288 int b= pal[d] &0xFF; | |
2289 int g=(pal[d]>>8 )&0xFF; | |
2290 int r= pal[d]>>16; | |
2291 | |
2292 dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; | |
2293 } | |
2294 } | |
2295 | |
2296 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal) | |
2297 { | |
2298 int i; | |
2299 assert(src1 == src2); | |
2300 for(i=0; i<width; i++) | |
2301 { | |
2302 int d0= src1[2*i ]; | |
2303 int d1= src1[2*i+1]; | |
2304 int p = (pal[d0]&0xFF00FF) + (pal[d1]&0xFF00FF); | |
2305 int g = (pal[d0]+pal[d1]-p)>>8; | |
2306 int b= p&0x1FF; | |
2307 int r= p>>16; | |
2308 | |
2309 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128; | |
2310 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128; | |
2311 } | |
2312 } | |
2313 | |
18861 | 2314 // Bilinear / Bicubic scaling |
/*
 * Horizontally filter one line of 8-bit samples into 16-bit intermediates.
 *
 * For every output sample i in [0, dstW) the plain C path accumulates
 *     src[filterPos[i] + j] * filter[filterSize*i + j]   for j in [0, filterSize)
 * and stores the sum shifted right by 7, clipped to 0 .. (1<<15)-1.
 *
 * Build variants:
 *  - HAVE_MMX:     three inline-asm loops chosen by filterSize (4, 8, other).
 *                  filterSize must be a positive multiple of 4 (asserted).
 *                  In the 4 and 8 variants, PIC builds save/restore REG_b by
 *                  hand with push/pop instead of declaring it as a clobber.
 *  - HAVE_ALTIVEC: forwards all arguments to hScale_altivec_real().
 *  - otherwise:    the C loop described above.
 *
 * srcW and xInc are only forwarded to the AltiVec implementation; the MMX and
 * C paths do not read them.
 * NOTE(review): the MMX paths scale with "psrad $8" followed by a multiply
 * with the external constant w02; presumably this matches the C path's
 * ">>7" -- confirm against the definition of w02.
 */
2315 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, | |
2316 int16_t *filter, int16_t *filterPos, long filterSize) | |
2317 { | |
2318 #ifdef HAVE_MMX | |
2319 assert(filterSize % 4 == 0 && filterSize>0); | |
2320 if(filterSize==4) // always true for upscaling, sometimes for down too | |
2321 { | |
2322 long counter= -2*dstW; | |
2323 filter-= counter*2; | |
2324 filterPos-= counter/2; | |
2325 dst-= counter/2; | |
2326 asm volatile( | |
19396 | 2327 #if defined(PIC) |
2328 "push %%"REG_b" \n\t" | |
2329 #endif | |
18861 | 2330 "pxor %%mm7, %%mm7 \n\t" |
2331 "movq "MANGLE(w02)", %%mm6 \n\t" | |
2332 "push %%"REG_BP" \n\t" // we use 7 regs here ... | |
2333 "mov %%"REG_a", %%"REG_BP" \n\t" | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
2334 ASMALIGN(4) |
18861 | 2335 "1: \n\t" |
2336 "movzwl (%2, %%"REG_BP"), %%eax \n\t" | |
2337 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t" | |
2338 "movq (%1, %%"REG_BP", 4), %%mm1\n\t" | |
2339 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t" | |
2340 "movd (%3, %%"REG_a"), %%mm0 \n\t" | |
2341 "movd (%3, %%"REG_b"), %%mm2 \n\t" | |
2342 "punpcklbw %%mm7, %%mm0 \n\t" | |
2343 "punpcklbw %%mm7, %%mm2 \n\t" | |
2344 "pmaddwd %%mm1, %%mm0 \n\t" | |
2345 "pmaddwd %%mm2, %%mm3 \n\t" | |
2346 "psrad $8, %%mm0 \n\t" | |
2347 "psrad $8, %%mm3 \n\t" | |
2348 "packssdw %%mm3, %%mm0 \n\t" | |
2349 "pmaddwd %%mm6, %%mm0 \n\t" | |
2350 "packssdw %%mm0, %%mm0 \n\t" | |
2351 "movd %%mm0, (%4, %%"REG_BP") \n\t" | |
2352 "add $4, %%"REG_BP" \n\t" | |
2353 " jnc 1b \n\t" | |
2354 | |
2355 "pop %%"REG_BP" \n\t" | |
19396 | 2356 #if defined(PIC) |
2357 "pop %%"REG_b" \n\t" | |
2358 #endif | |
18861 | 2359 : "+a" (counter) |
2360 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
19396 | 2361 #if !defined(PIC) |
18861 | 2362 : "%"REG_b |
19396 | 2363 #endif |
18861 | 2364 ); |
2365 } | |
/* Same loop structure as the 4-tap case, with a second group of four taps accumulated per output sample. */
2366 else if(filterSize==8) | |
2367 { | |
2368 long counter= -2*dstW; | |
2369 filter-= counter*4; | |
2370 filterPos-= counter/2; | |
2371 dst-= counter/2; | |
2372 asm volatile( | |
19396 | 2373 #if defined(PIC) |
2374 "push %%"REG_b" \n\t" | |
2375 #endif | |
18861 | 2376 "pxor %%mm7, %%mm7 \n\t" |
2377 "movq "MANGLE(w02)", %%mm6 \n\t" | |
2378 "push %%"REG_BP" \n\t" // we use 7 regs here ... | |
2379 "mov %%"REG_a", %%"REG_BP" \n\t" | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
2380 ASMALIGN(4) |
18861 | 2381 "1: \n\t" |
2382 "movzwl (%2, %%"REG_BP"), %%eax \n\t" | |
2383 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t" | |
2384 "movq (%1, %%"REG_BP", 8), %%mm1\n\t" | |
2385 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t" | |
2386 "movd (%3, %%"REG_a"), %%mm0 \n\t" | |
2387 "movd (%3, %%"REG_b"), %%mm2 \n\t" | |
2388 "punpcklbw %%mm7, %%mm0 \n\t" | |
2389 "punpcklbw %%mm7, %%mm2 \n\t" | |
2390 "pmaddwd %%mm1, %%mm0 \n\t" | |
2391 "pmaddwd %%mm2, %%mm3 \n\t" | |
2392 | |
2393 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t" | |
2394 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t" | |
2395 "movd 4(%3, %%"REG_a"), %%mm4 \n\t" | |
2396 "movd 4(%3, %%"REG_b"), %%mm2 \n\t" | |
2397 "punpcklbw %%mm7, %%mm4 \n\t" | |
2398 "punpcklbw %%mm7, %%mm2 \n\t" | |
2399 "pmaddwd %%mm1, %%mm4 \n\t" | |
2400 "pmaddwd %%mm2, %%mm5 \n\t" | |
2401 "paddd %%mm4, %%mm0 \n\t" | |
2402 "paddd %%mm5, %%mm3 \n\t" | |
2403 | |
2404 "psrad $8, %%mm0 \n\t" | |
2405 "psrad $8, %%mm3 \n\t" | |
2406 "packssdw %%mm3, %%mm0 \n\t" | |
2407 "pmaddwd %%mm6, %%mm0 \n\t" | |
2408 "packssdw %%mm0, %%mm0 \n\t" | |
2409 "movd %%mm0, (%4, %%"REG_BP") \n\t" | |
2410 "add $4, %%"REG_BP" \n\t" | |
2411 " jnc 1b \n\t" | |
2412 | |
2413 "pop %%"REG_BP" \n\t" | |
19396 | 2414 #if defined(PIC) |
2415 "pop %%"REG_b" \n\t" | |
2416 #endif | |
18861 | 2417 : "+a" (counter) |
2418 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
19396 | 2419 #if !defined(PIC) |
18861 | 2420 : "%"REG_b |
19396 | 2421 #endif |
18861 | 2422 ); |
2423 } | |
/* Generic case: inner loop "2:" walks the taps four at a time until the source pointer reaches offset (src+filterSize). */
2424 else | |
2425 { | |
2426 uint8_t *offset = src+filterSize; | |
2427 long counter= -2*dstW; | |
2428 // filter-= counter*filterSize/2; | |
2429 filterPos-= counter/2; | |
2430 dst-= counter/2; | |
2431 asm volatile( | |
2432 "pxor %%mm7, %%mm7 \n\t" | |
2433 "movq "MANGLE(w02)", %%mm6 \n\t" | |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
2434 ASMALIGN(4) |
18861 | 2435 "1: \n\t" |
2436 "mov %2, %%"REG_c" \n\t" | |
2437 "movzwl (%%"REG_c", %0), %%eax \n\t" | |
19396 | 2438 "movzwl 2(%%"REG_c", %0), %%edx \n\t" |
18861 | 2439 "mov %5, %%"REG_c" \n\t" |
2440 "pxor %%mm4, %%mm4 \n\t" | |
2441 "pxor %%mm5, %%mm5 \n\t" | |
2442 "2: \n\t" | |
2443 "movq (%1), %%mm1 \n\t" | |
2444 "movq (%1, %6), %%mm3 \n\t" | |
2445 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t" | |
19396 | 2446 "movd (%%"REG_c", %%"REG_d"), %%mm2\n\t" |
18861 | 2447 "punpcklbw %%mm7, %%mm0 \n\t" |
2448 "punpcklbw %%mm7, %%mm2 \n\t" | |
2449 "pmaddwd %%mm1, %%mm0 \n\t" | |
2450 "pmaddwd %%mm2, %%mm3 \n\t" | |
2451 "paddd %%mm3, %%mm5 \n\t" | |
2452 "paddd %%mm0, %%mm4 \n\t" | |
2453 "add $8, %1 \n\t" | |
2454 "add $4, %%"REG_c" \n\t" | |
2455 "cmp %4, %%"REG_c" \n\t" | |
2456 " jb 2b \n\t" | |
2457 "add %6, %1 \n\t" | |
2458 "psrad $8, %%mm4 \n\t" | |
2459 "psrad $8, %%mm5 \n\t" | |
2460 "packssdw %%mm5, %%mm4 \n\t" | |
2461 "pmaddwd %%mm6, %%mm4 \n\t" | |
2462 "packssdw %%mm4, %%mm4 \n\t" | |
2463 "mov %3, %%"REG_a" \n\t" | |
2464 "movd %%mm4, (%%"REG_a", %0) \n\t" | |
2465 "add $4, %0 \n\t" | |
2466 " jnc 1b \n\t" | |
2467 | |
2468 : "+r" (counter), "+r" (filter) | |
2469 : "m" (filterPos), "m" (dst), "m"(offset), | |
2470 "m" (src), "r" (filterSize*2) | |
19396 | 2471 : "%"REG_a, "%"REG_c, "%"REG_d |
18861 | 2472 ); |
2473 } | |
2474 #else | |
2475 #ifdef HAVE_ALTIVEC | |
2476 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize); | |
2477 #else | |
2478 int i; | |
2479 for(i=0; i<dstW; i++) | |
2480 { | |
2481 int j; | |
2482 int srcPos= filterPos[i]; | |
2483 int val=0; | |
2484 // printf("filterPos: %d\n", filterPos[i]); | |
2485 for(j=0; j<filterSize; j++) | |
2486 { | |
2487 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); | |
2488 val += ((int)src[srcPos + j])*filter[filterSize*i + j]; | |
2489 } | |
2490 // filter += hFilterSize; | |
22321 | 2491 dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ... |
18861 | 2492 // dst[i] = val>>7; |
2493 } | |
2494 #endif | |
2495 #endif | |
2496 } | |
2497 // *** horizontal scale Y line to temp buffer | |
/*
 * Produce one horizontally scaled luma line in dst.
 *
 * Step 1 - input conversion: when srcFormat is one of the packed YUV, RGB/BGR
 * or paletted formats tested below, the matching *ToY helper writes srcW
 * 8-bit luma samples into formatConvBuffer and src is redirected to it.
 * Any other format is scaled directly from src.
 *
 * Step 2 - scaling:
 *  - SWS_FAST_BILINEAR not set in flags (or, on HAVE_MMX builds, also when
 *    canMMX2BeUsed is 0): RENAME(hScale) with hLumFilter / hLumFilterPos /
 *    hLumFilterSize.
 *  - otherwise, fast bilinear:
 *      ARCH_X86 + HAVE_MMX2 and canMMX2BeUsed: calls the code at funnyYCode
 *          eight times with mmx2Filter / mmx2FilterPos, then overwrites every
 *          trailing dst[i] whose source position (i*xInc)>>16 is >= srcW-1
 *          with src[srcW-1]*128.
 *      ARCH_X86 otherwise: plain x86 asm, two output samples per iteration.
 *      other architectures: C loop stepping a 16.16 fixed-point position by
 *          xInc and interpolating between src[xx] and src[xx+1].
 *
 * NOTE(review): pal is declared uint8_t* here but handed to palToY, which
 * takes uint32_t* -- confirm the caller really passes a 32-bit palette.
 * NOTE(review): the C and x86 asm bilinear loops read src[xx+1]; presumably
 * the caller guarantees one readable sample past the last one used -- confirm.
 */
2498 static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc, | |
2499 int flags, int canMMX2BeUsed, int16_t *hLumFilter, | |
2500 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, | |
2501 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, | |
22218 | 2502 int32_t *mmx2FilterPos, uint8_t *pal) |
18861 | 2503 { |
20411 | 2504 if(srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE) |
18861 | 2505 { |
2506 RENAME(yuy2ToY)(formatConvBuffer, src, srcW); | |
2507 src= formatConvBuffer; | |
2508 } | |
20411 | 2509 else if(srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE) |
18861 | 2510 { |
2511 RENAME(uyvyToY)(formatConvBuffer, src, srcW); | |
2512 src= formatConvBuffer; | |
2513 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
2514 else if(srcFormat==PIX_FMT_RGB32) |
18861 | 2515 { |
2516 RENAME(bgr32ToY)(formatConvBuffer, src, srcW); | |
2517 src= formatConvBuffer; | |
2518 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
2519 else if(srcFormat==PIX_FMT_BGR24) |
18861 | 2520 { |
2521 RENAME(bgr24ToY)(formatConvBuffer, src, srcW); | |
2522 src= formatConvBuffer; | |
2523 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
2524 else if(srcFormat==PIX_FMT_BGR565) |
18861 | 2525 { |
2526 RENAME(bgr16ToY)(formatConvBuffer, src, srcW); | |
2527 src= formatConvBuffer; | |
2528 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
2529 else if(srcFormat==PIX_FMT_BGR555) |
18861 | 2530 { |
2531 RENAME(bgr15ToY)(formatConvBuffer, src, srcW); | |
2532 src= formatConvBuffer; | |
2533 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
2534 else if(srcFormat==PIX_FMT_BGR32) |
18861 | 2535 { |
2536 RENAME(rgb32ToY)(formatConvBuffer, src, srcW); | |
2537 src= formatConvBuffer; | |
2538 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
2539 else if(srcFormat==PIX_FMT_RGB24) |
18861 | 2540 { |
2541 RENAME(rgb24ToY)(formatConvBuffer, src, srcW); | |
2542 src= formatConvBuffer; | |
2543 } | |
20589
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2544 else if(srcFormat==PIX_FMT_RGB565) |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2545 { |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2546 RENAME(rgb16ToY)(formatConvBuffer, src, srcW); |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2547 src= formatConvBuffer; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2548 } |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2549 else if(srcFormat==PIX_FMT_RGB555) |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2550 { |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2551 RENAME(rgb15ToY)(formatConvBuffer, src, srcW); |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2552 src= formatConvBuffer; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2553 } |
22226 | 2554 else if(srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE) |
22218 | 2555 { |
2556 RENAME(palToY)(formatConvBuffer, src, srcW, pal); | |
2557 src= formatConvBuffer; | |
2558 } | |
18861 | 2559 |
2560 #ifdef HAVE_MMX | |
2561 // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one) | |
2562 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) | |
2563 #else | |
2564 if(!(flags&SWS_FAST_BILINEAR)) | |
2565 #endif | |
2566 { | |
2567 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); | |
2568 } | |
2569 else // Fast Bilinear upscale / crap downscale | |
2570 { | |
20576 | 2571 #if defined(ARCH_X86) |
18861 | 2572 #ifdef HAVE_MMX2 |
2573 int i; | |
19396 | 2574 #if defined(PIC) |
2575 uint64_t ebxsave __attribute__((aligned(8))); | |
2576 #endif | |
18861 | 2577 if(canMMX2BeUsed) |
2578 { | |
2579 asm volatile( | |
19396 | 2580 #if defined(PIC) |
2581 "mov %%"REG_b", %5 \n\t" | |
2582 #endif | |
18861 | 2583 "pxor %%mm7, %%mm7 \n\t" |
2584 "mov %0, %%"REG_c" \n\t" | |
2585 "mov %1, %%"REG_D" \n\t" | |
2586 "mov %2, %%"REG_d" \n\t" | |
2587 "mov %3, %%"REG_b" \n\t" | |
2588 "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2589 PREFETCH" (%%"REG_c") \n\t" | |
2590 PREFETCH" 32(%%"REG_c") \n\t" | |
2591 PREFETCH" 64(%%"REG_c") \n\t" | |
2592 | |
2593 #ifdef ARCH_X86_64 | |
2594 | |
2595 #define FUNNY_Y_CODE \ | |
2596 "movl (%%"REG_b"), %%esi \n\t"\ | |
2597 "call *%4 \n\t"\ | |
2598 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\ | |
2599 "add %%"REG_S", %%"REG_c" \n\t"\ | |
2600 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2601 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2602 | |
2603 #else | |
2604 | |
2605 #define FUNNY_Y_CODE \ | |
2606 "movl (%%"REG_b"), %%esi \n\t"\ | |
2607 "call *%4 \n\t"\ | |
2608 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\ | |
2609 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2610 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2611 | |
2612 #endif | |
2613 | |
2614 FUNNY_Y_CODE | |
2615 FUNNY_Y_CODE | |
2616 FUNNY_Y_CODE | |
2617 FUNNY_Y_CODE | |
2618 FUNNY_Y_CODE | |
2619 FUNNY_Y_CODE | |
2620 FUNNY_Y_CODE | |
2621 FUNNY_Y_CODE | |
2622 | |
19396 | 2623 #if defined(PIC) |
2624 "mov %5, %%"REG_b" \n\t" | |
2625 #endif | |
18861 | 2626 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2627 "m" (funnyYCode) | |
19396 | 2628 #if defined(PIC) |
2629 ,"m" (ebxsave) | |
2630 #endif | |
2631 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D | |
2632 #if !defined(PIC) | |
2633 ,"%"REG_b | |
2634 #endif | |
18861 | 2635 ); |
2636 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; | |
2637 } | |
2638 else | |
2639 { | |
2640 #endif | |
2641 long xInc_shr16 = xInc >> 16; | |
2642 uint16_t xInc_mask = xInc & 0xffff; | |
2643 //NO MMX just normal asm ... | |
2644 asm volatile( | |
2645 "xor %%"REG_a", %%"REG_a" \n\t" // i | |
19396 | 2646 "xor %%"REG_d", %%"REG_d" \n\t" // xx |
18861 | 2647 "xorl %%ecx, %%ecx \n\t" // 2*xalpha |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
2648 ASMALIGN(4) |
18861 | 2649 "1: \n\t" |
19396 | 2650 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] |
2651 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
18861 | 2652 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2653 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2654 "shll $16, %%edi \n\t" | |
2655 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2656 "mov %1, %%"REG_D" \n\t" | |
2657 "shrl $9, %%esi \n\t" | |
2658 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t" | |
2659 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF | |
19396 | 2660 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry |
18861 | 2661 |
19396 | 2662 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] |
2663 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
18861 | 2664 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2665 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2666 "shll $16, %%edi \n\t" | |
2667 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2668 "mov %1, %%"REG_D" \n\t" | |
2669 "shrl $9, %%esi \n\t" | |
2670 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t" | |
2671 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF | |
19396 | 2672 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry |
18861 | 2673 |
2674 | |
2675 "add $2, %%"REG_a" \n\t" | |
2676 "cmp %2, %%"REG_a" \n\t" | |
2677 " jb 1b \n\t" | |
2678 | |
2679 | |
2680 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask) | |
19396 | 2681 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" |
18861 | 2682 ); |
2683 #ifdef HAVE_MMX2 | |
2684 } //if MMX2 can't be used | |
2685 #endif | |
2686 #else | |
2687 int i; | |
2688 unsigned int xpos=0; | |
2689 for(i=0;i<dstWidth;i++) | |
2690 { | |
2691 register unsigned int xx=xpos>>16; | |
2692 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2693 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; | |
2694 xpos+=xInc; | |
2695 } | |
2696 #endif | |
2697 } | |
2698 } | |
2699 | |
2700 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2, | |
2701 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, | |
2702 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, | |
2703 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, | |
22218 | 2704 int32_t *mmx2FilterPos, uint8_t *pal) |
18861 | 2705 { |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
2706 if(srcFormat==PIX_FMT_YUYV422) |
18861 | 2707 { |
2708 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2709 src1= formatConvBuffer; | |
2710 src2= formatConvBuffer+2048; | |
2711 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
2712 else if(srcFormat==PIX_FMT_UYVY422) |
18861 | 2713 { |
2714 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2715 src1= formatConvBuffer; | |
2716 src2= formatConvBuffer+2048; | |
2717 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
2718 else if(srcFormat==PIX_FMT_RGB32) |
18861 | 2719 { |
2720 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2721 src1= formatConvBuffer; | |
2722 src2= formatConvBuffer+2048; | |
2723 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
2724 else if(srcFormat==PIX_FMT_BGR24) |
18861 | 2725 { |
2726 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2727 src1= formatConvBuffer; | |
2728 src2= formatConvBuffer+2048; | |
2729 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
2730 else if(srcFormat==PIX_FMT_BGR565) |
18861 | 2731 { |
2732 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2733 src1= formatConvBuffer; | |
2734 src2= formatConvBuffer+2048; | |
2735 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
2736 else if(srcFormat==PIX_FMT_BGR555) |
18861 | 2737 { |
2738 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2739 src1= formatConvBuffer; | |
2740 src2= formatConvBuffer+2048; | |
2741 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
2742 else if(srcFormat==PIX_FMT_BGR32) |
18861 | 2743 { |
2744 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2745 src1= formatConvBuffer; | |
2746 src2= formatConvBuffer+2048; | |
2747 } | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
2748 else if(srcFormat==PIX_FMT_RGB24) |
18861 | 2749 { |
2750 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2751 src1= formatConvBuffer; | |
2752 src2= formatConvBuffer+2048; | |
2753 } | |
20589
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2754 else if(srcFormat==PIX_FMT_RGB565) |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2755 { |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2756 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2757 src1= formatConvBuffer; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2758 src2= formatConvBuffer+2048; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2759 } |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2760 else if(srcFormat==PIX_FMT_RGB555) |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2761 { |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2762 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2763 src1= formatConvBuffer; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2764 src2= formatConvBuffer+2048; |
95695bfce2f0
Add support for conversions from the rgb565 and rgb555 formats
lucabe
parents:
20576
diff
changeset
|
2765 } |
18861 | 2766 else if(isGray(srcFormat)) |
2767 { | |
2768 return; | |
2769 } | |
22226 | 2770 else if(srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE) |
22218 | 2771 { |
2772 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW, pal); | |
2773 src1= formatConvBuffer; | |
2774 src2= formatConvBuffer+2048; | |
2775 } | |
18861 | 2776 |
2777 #ifdef HAVE_MMX | |
2778 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one) | |
2779 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) | |
2780 #else | |
2781 if(!(flags&SWS_FAST_BILINEAR)) | |
2782 #endif | |
2783 { | |
2784 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2785 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2786 } | |
2787 else // Fast Bilinear upscale / crap downscale | |
2788 { | |
20576 | 2789 #if defined(ARCH_X86) |
18861 | 2790 #ifdef HAVE_MMX2 |
2791 int i; | |
19396 | 2792 #if defined(PIC) |
2793 uint64_t ebxsave __attribute__((aligned(8))); | |
2794 #endif | |
18861 | 2795 if(canMMX2BeUsed) |
2796 { | |
2797 asm volatile( | |
19396 | 2798 #if defined(PIC) |
2799 "mov %%"REG_b", %6 \n\t" | |
2800 #endif | |
18861 | 2801 "pxor %%mm7, %%mm7 \n\t" |
2802 "mov %0, %%"REG_c" \n\t" | |
2803 "mov %1, %%"REG_D" \n\t" | |
2804 "mov %2, %%"REG_d" \n\t" | |
2805 "mov %3, %%"REG_b" \n\t" | |
2806 "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2807 PREFETCH" (%%"REG_c") \n\t" | |
2808 PREFETCH" 32(%%"REG_c") \n\t" | |
2809 PREFETCH" 64(%%"REG_c") \n\t" | |
2810 | |
2811 #ifdef ARCH_X86_64 | |
2812 | |
2813 #define FUNNY_UV_CODE \ | |
2814 "movl (%%"REG_b"), %%esi \n\t"\ | |
2815 "call *%4 \n\t"\ | |
2816 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\ | |
2817 "add %%"REG_S", %%"REG_c" \n\t"\ | |
2818 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2819 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2820 | |
2821 #else | |
2822 | |
2823 #define FUNNY_UV_CODE \ | |
2824 "movl (%%"REG_b"), %%esi \n\t"\ | |
2825 "call *%4 \n\t"\ | |
2826 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\ | |
2827 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2828 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2829 | |
2830 #endif | |
2831 | |
2832 FUNNY_UV_CODE | |
2833 FUNNY_UV_CODE | |
2834 FUNNY_UV_CODE | |
2835 FUNNY_UV_CODE | |
2836 "xor %%"REG_a", %%"REG_a" \n\t" // i | |
2837 "mov %5, %%"REG_c" \n\t" // src | |
2838 "mov %1, %%"REG_D" \n\t" // buf1 | |
2839 "add $4096, %%"REG_D" \n\t" | |
2840 PREFETCH" (%%"REG_c") \n\t" | |
2841 PREFETCH" 32(%%"REG_c") \n\t" | |
2842 PREFETCH" 64(%%"REG_c") \n\t" | |
2843 | |
2844 FUNNY_UV_CODE | |
2845 FUNNY_UV_CODE | |
2846 FUNNY_UV_CODE | |
2847 FUNNY_UV_CODE | |
2848 | |
19396 | 2849 #if defined(PIC) |
2850 "mov %6, %%"REG_b" \n\t" | |
2851 #endif | |
18861 | 2852 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2853 "m" (funnyUVCode), "m" (src2) | |
19396 | 2854 #if defined(PIC) |
2855 ,"m" (ebxsave) | |
2856 #endif | |
19400
0310c3310360
Fix compilation with -no-PIC and without -fomit-frame-pointer (used by
uau
parents:
19396
diff
changeset
|
2857 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D |
19396 | 2858 #if !defined(PIC) |
2859 ,"%"REG_b | |
2860 #endif | |
18861 | 2861 ); |
2862 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) | |
2863 { | |
2864 // printf("%d %d %d\n", dstWidth, i, srcW); | |
2865 dst[i] = src1[srcW-1]*128; | |
2866 dst[i+2048] = src2[srcW-1]*128; | |
2867 } | |
2868 } | |
2869 else | |
2870 { | |
2871 #endif | |
2872 long xInc_shr16 = (long) (xInc >> 16); | |
2873 uint16_t xInc_mask = xInc & 0xffff; | |
2874 asm volatile( | |
2875 "xor %%"REG_a", %%"REG_a" \n\t" // i | |
19396 | 2876 "xor %%"REG_d", %%"REG_d" \n\t" // xx |
18861 | 2877 "xorl %%ecx, %%ecx \n\t" // 2*xalpha |
19372
6334c14b38eb
Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents:
19181
diff
changeset
|
2878 ASMALIGN(4) |
18861 | 2879 "1: \n\t" |
2880 "mov %0, %%"REG_S" \n\t" | |
19396 | 2881 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx] |
2882 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1] | |
18861 | 2883 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2884 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2885 "shll $16, %%edi \n\t" | |
2886 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2887 "mov %1, %%"REG_D" \n\t" | |
2888 "shrl $9, %%esi \n\t" | |
2889 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t" | |
2890 | |
19396 | 2891 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx] |
2892 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1] | |
18861 | 2893 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2894 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2895 "shll $16, %%edi \n\t" | |
2896 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2897 "mov %1, %%"REG_D" \n\t" | |
2898 "shrl $9, %%esi \n\t" | |
2899 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t" | |
2900 | |
2901 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
19396 | 2902 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry |
18861 | 2903 "add $1, %%"REG_a" \n\t" |
2904 "cmp %2, %%"REG_a" \n\t" | |
2905 " jb 1b \n\t" | |
2906 | |
2907 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here, | |
2908 which is needed to support GCC-4.0 */ | |
2909 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4)) | |
2910 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), | |
2911 #else | |
2912 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), | |
2913 #endif | |
2914 "r" (src2) | |
19396 | 2915 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" |
18861 | 2916 ); |
2917 #ifdef HAVE_MMX2 | |
2918 } //if MMX2 can't be used | |
2919 #endif | |
2920 #else | |
2921 int i; | |
2922 unsigned int xpos=0; | |
2923 for(i=0;i<dstWidth;i++) | |
2924 { | |
2925 register unsigned int xx=xpos>>16; | |
2926 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2927 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | |
2928 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | |
2929 /* slower | |
2930 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; | |
2931 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; | |
2932 */ | |
2933 xpos+=xInc; | |
2934 } | |
2935 #endif | |
2936 } | |
2937 } | |
2938 | |
/*
 * Scale one slice of the source picture into the destination picture.
 *
 * Source lines of the slice are horizontally scaled (hyscale / hcscale) into
 * the luma and chroma line buffers (lumPixBuf / chrPixBuf). Every destination
 * line whose input lines are all available is then vertically scaled and
 * written in the destination format. If the slice does not contain enough
 * lines for the next destination line, the remaining lines of the slice are
 * buffered and the function returns, so that the next call can continue.
 *
 * The progress state (dstY, lumBufIndex, chrBufIndex, lastInLumBuf,
 * lastInChrBuf) is read from the context at entry, reset when srcSliceY is 0,
 * and written back to the context before returning.
 *
 * Side effects on the arguments: for packed source formats src[0..2] and
 * srcStride[0..2] are overwritten with the values of plane 0, and
 * srcStride[1] / srcStride[2] are always shifted left by c->vChrDrop in place.
 *
 * c          scaler context
 * src        source plane pointers; they point at the first line of the slice
 *            (lines are addressed relative to srcSliceY)
 * srcStride  strides of the source planes
 * srcSliceY  position of the first line of the slice in the source picture
 * srcSliceH  number of lines in the slice
 * dst        destination plane pointers
 * dstStride  strides of the destination planes
 *
 * Returns the number of destination lines processed by this call.
 */
2939 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | |
2940 int srcSliceH, uint8_t* dst[], int dstStride[]){ | |
2941 | |
2942 /* load a few things into local vars to make the code more readable? and faster */ | |
2943 const int srcW= c->srcW; | |
2944 const int dstW= c->dstW; | |
2945 const int dstH= c->dstH; | |
2946 const int chrDstW= c->chrDstW; | |
2947 const int chrSrcW= c->chrSrcW; | |
2948 const int lumXInc= c->lumXInc; | |
2949 const int chrXInc= c->chrXInc; | |
2950 const int dstFormat= c->dstFormat; | |
2951 const int srcFormat= c->srcFormat; | |
2952 const int flags= c->flags; | |
2953 const int canMMX2BeUsed= c->canMMX2BeUsed; | |
2954 int16_t *vLumFilterPos= c->vLumFilterPos; | |
2955 int16_t *vChrFilterPos= c->vChrFilterPos; | |
2956 int16_t *hLumFilterPos= c->hLumFilterPos; | |
2957 int16_t *hChrFilterPos= c->hChrFilterPos; | |
2958 int16_t *vLumFilter= c->vLumFilter; | |
2959 int16_t *vChrFilter= c->vChrFilter; | |
2960 int16_t *hLumFilter= c->hLumFilter; | |
2961 int16_t *hChrFilter= c->hChrFilter; | |
2962 int32_t *lumMmxFilter= c->lumMmxFilter; | |
2963 int32_t *chrMmxFilter= c->chrMmxFilter; | |
2964 const int vLumFilterSize= c->vLumFilterSize; | |
2965 const int vChrFilterSize= c->vChrFilterSize; | |
2966 const int hLumFilterSize= c->hLumFilterSize; | |
2967 const int hChrFilterSize= c->hChrFilterSize; | |
2968 int16_t **lumPixBuf= c->lumPixBuf; | |
2969 int16_t **chrPixBuf= c->chrPixBuf; | |
2970 const int vLumBufSize= c->vLumBufSize; | |
2971 const int vChrBufSize= c->vChrBufSize; | |
2972 uint8_t *funnyYCode= c->funnyYCode; | |
2973 uint8_t *funnyUVCode= c->funnyUVCode; | |
2974 uint8_t *formatConvBuffer= c->formatConvBuffer; | |
/* Slice position and height expressed in chroma lines. The height uses
   -((-x)>>s) so that the division by 2^s rounds up instead of down
   (this relies on >> of a negative int being an arithmetic shift). */
2975 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample; | |
2976 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); | |
2977 int lastDstY; | |
22218 | 2978 uint8_t *pal=NULL; |
18861 | 2979 |
2980 /* vars which will change and which we need to store back in the context */ | |
2981 int dstY= c->dstY; | |
2982 int lumBufIndex= c->lumBufIndex; | |
2983 int chrBufIndex= c->chrBufIndex; | |
2984 int lastInLumBuf= c->lastInLumBuf; | |
2985 int lastInChrBuf= c->lastInChrBuf; | |
2986 | |
/* Packed source: keep the original src[1] in pal (it is handed to hyscale /
   hcscale below) and then make all three plane pointers and strides refer to
   plane 0. Note that this writes into the caller's arrays. */
2987 if(isPacked(c->srcFormat)){ | |
22218 | 2988 pal= src[1]; |
18861 | 2989 src[0]= |
2990 src[1]= | |
2991 src[2]= src[0]; | |
2992 srcStride[0]= | |
2993 srcStride[1]= | |
2994 srcStride[2]= srcStride[0]; | |
2995 } | |
/* The chroma strides are multiplied by 2^vChrDrop, again in the caller's array.
   NOTE(review): presumably this makes the chroma reader skip source lines;
   confirm against the code that sets vChrDrop. */
2996 srcStride[1]<<= c->vChrDrop; | |
2997 srcStride[2]<<= c->vChrDrop; | |
2998 | |
2999 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2], | |
3000 // (int)dst[0], (int)dst[1], (int)dst[2]); | |
3001 | |
3002 #if 0 //self test FIXME move to a vfilter or something | |
3003 { | |
3004 static volatile int i=0; | |
3005 i++; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
3006 if(srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH) |
18861 | 3007 selfTest(src, srcStride, c->srcW, c->srcH); |
3008 i--; | |
3009 } | |
3010 #endif | |
3011 | |
3012 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2], | |
3013 //dstStride[0],dstStride[1],dstStride[2]); | |
3014 | |
// Warn once (static flag, shared by all contexts) if any destination stride is not a multiple of 8.
3015 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0) | |
3016 { | |
3017 static int firstTime=1; //FIXME move this into the context perhaps | |
3018 if(flags & SWS_PRINT_INFO && firstTime) | |
3019 { | |
21981 | 3020 av_log(c, AV_LOG_WARNING, "SwScaler: Warning: dstStride is not aligned!\n" |
18861 | 3021 "SwScaler: ->cannot do aligned memory acesses anymore\n"); |
3022 firstTime=0; | |
3023 } | |
3024 } | |
3025 | |
3026 /* Note the user might start scaling the picture in the middle so this will not get executed | |
3027 this is not really intended but works currently, so ppl might do it */ | |
3028 if(srcSliceY ==0){ | |
3029 lumBufIndex=0; | |
3030 chrBufIndex=0; | |
3031 dstY=0; | |
3032 lastInLumBuf= -1; | |
3033 lastInChrBuf= -1; | |
3034 } | |
3035 | |
// remember the first destination line of this call; used for the return value
3036 lastDstY= dstY; | |
3037 | |
3038 for(;dstY < dstH; dstY++){ | |
3039 unsigned char *dest =dst[0]+dstStride[0]*dstY; | |
3040 const int chrDstY= dstY>>c->chrDstVSubSample; | |
3041 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY; | |
3042 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY; | |
3043 | |
3044 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input | |
3045 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input | |
3046 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input | |
3047 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input | |
3048 | |
3049 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n", | |
3050 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample); | |
3051 //handle holes (FAST_BILINEAR & weird filters) | |
3052 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; | |
3053 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; | |
3054 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize); | |
3055 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1) | |
3056 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1) | |
3057 | |
3058 // Do we have enough lines in this slice to output the dstY line | |
3059 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample)) | |
3060 { | |
3061 //Do horizontal scaling | |
3062 while(lastInLumBuf < lastLumSrcY) | |
3063 { | |
3064 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; | |
3065 lumBufIndex++; | |
3066 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY); | |
3067 ASSERT(lumBufIndex < 2*vLumBufSize) | |
3068 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
3069 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
3070 // printf("%d %d\n", lumBufIndex, vLumBufSize); | |
3071 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, | |
3072 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, | |
3073 funnyYCode, c->srcFormat, formatConvBuffer, | |
22218 | 3074 c->lumMmx2Filter, c->lumMmx2FilterPos, pal); |
18861 | 3075 lastInLumBuf++; |
3076 } | |
3077 while(lastInChrBuf < lastChrSrcY) | |
3078 { | |
3079 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; | |
3080 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; | |
3081 chrBufIndex++; | |
3082 ASSERT(chrBufIndex < 2*vChrBufSize) | |
3083 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)) | |
3084 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0) | |
3085 //FIXME replace parameters through context struct (some at least) | |
3086 | |
// chroma is not scaled when source or destination is gray; the buffer bookkeeping still advances
3087 if(!(isGray(srcFormat) || isGray(dstFormat))) | |
3088 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, | |
3089 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, | |
3090 funnyUVCode, c->srcFormat, formatConvBuffer, | |
22218 | 3091 c->chrMmx2Filter, c->chrMmx2FilterPos, pal); |
18861 | 3092 lastInChrBuf++; |
3093 } | |
3094 //wrap buf index around to stay inside the ring buffer | |
3095 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
3096 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
3097 } | |
3098 else // not enough lines left in this slice -> load the rest in the buffer | |
3099 { | |
3100 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n", | |
3101 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY, | |
3102 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize, | |
3103 vChrBufSize, vLumBufSize);*/ | |
3104 | |
3105 //Do horizontal scaling | |
3106 while(lastInLumBuf+1 < srcSliceY + srcSliceH) | |
3107 { | |
3108 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; | |
3109 lumBufIndex++; | |
3110 ASSERT(lumBufIndex < 2*vLumBufSize) | |
3111 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
3112 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
3113 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, | |
3114 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, | |
3115 funnyYCode, c->srcFormat, formatConvBuffer, | |
22218 | 3116 c->lumMmx2Filter, c->lumMmx2FilterPos, pal); |
18861 | 3117 lastInLumBuf++; |
3118 } | |
3119 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH)) | |
3120 { | |
3121 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; | |
3122 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; | |
3123 chrBufIndex++; | |
3124 ASSERT(chrBufIndex < 2*vChrBufSize) | |
3125 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH) | |
3126 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0) | |
3127 | |
3128 if(!(isGray(srcFormat) || isGray(dstFormat))) | |
3129 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, | |
3130 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, | |
3131 funnyUVCode, c->srcFormat, formatConvBuffer, | |
22218 | 3132 c->chrMmx2Filter, c->chrMmx2FilterPos, pal); |
18861 | 3133 lastInChrBuf++; |
3134 } | |
3135 //wrap buf index around to stay inside the ring buffer | |
3136 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
3137 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
3138 break; //we can't output a dstY line so let's try with the next slice | |
3139 } | |
3140 | |
3141 #ifdef HAVE_MMX | |
// pick the dither values for this output line; the table index alternates with the line parity
3142 b5Dither= dither8[dstY&1]; | |
3143 g6Dither= dither4[dstY&1]; | |
3144 g5Dither= dither8[dstY&1]; | |
3145 r5Dither= dither8[(dstY+1)&1]; | |
3146 #endif | |
// all but the last two destination lines go through the RENAME()d output functions
3147 if(dstY < dstH-2) | |
3148 { | |
/* Pointers to the buffered line for firstLumSrcY / firstChrSrcY; the following
   filter-size entries are the vertical filter inputs. The ASSERTs further down
   bound these by lumPixBuf + vLumBufSize*2 (resp. chrPixBuf + vChrBufSize*2).
   NOTE(review): this presumably relies on the pointer arrays holding every line
   pointer twice so no wrap-around is needed here; confirm in the allocation code. */
3149 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
3150 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
3151 #ifdef HAVE_MMX | |
3152 int i; | |
/* Fill lumMmxFilter / chrMmxFilter for this output line.
   Accurate-rounding layout, one group of 4 entries per pair of filter taps:
     [0] pointer of line i, [1] pointer of line i+1 (line i again if the filter
     size is 1), [2] and [3] coefficient i plus coefficient i+1 shifted left by 16.
   NOTE(review): the pointers are cast to int32_t here, so only their low 32 bits
   are stored; verify that this path is safe on 64-bit targets. */
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3153 if(flags & SWS_ACCURATE_RND){ |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3154 for(i=0; i<vLumFilterSize; i+=2){ |
21756 | 3155 lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i ]; |
3156 lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)]; | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3157 lumMmxFilter[2*i+2]= |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3158 lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ] |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3159 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0); |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3160 } |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3161 for(i=0; i<vChrFilterSize; i+=2){ |
21756 | 3162 chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i ]; |
3163 chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)]; | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3164 chrMmxFilter[2*i+2]= |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3165 chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ] |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3166 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0); |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3167 } |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3168 }else{ |
/* Default layout, one group of 4 entries per filter tap:
     [0] low 32 bits of the line pointer, [1] its upper 32 bits,
     [2] and [3] the 16-bit coefficient replicated into both 16-bit halves (*0x10001). */
18861 | 3169 for(i=0; i<vLumFilterSize; i++) |
3170 { | |
3171 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i]; | |
22383
508e55817748
Fix a possible crash on 64 bit systems when the lumSrcPtr or chrSrcPtr
reimar
parents:
22321
diff
changeset
|
3172 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32; |
18861 | 3173 lumMmxFilter[4*i+2]= |
3174 lumMmxFilter[4*i+3]= | |
3175 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; | |
3176 } | |
3177 for(i=0; i<vChrFilterSize; i++) | |
3178 { | |
3179 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i]; | |
22383
508e55817748
Fix a possible crash on 64 bit systems when the lumSrcPtr or chrSrcPtr
reimar
parents:
22321
diff
changeset
|
3180 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32; |
18861 | 3181 chrMmxFilter[4*i+2]= |
3182 chrMmxFilter[4*i+3]= | |
3183 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; | |
3184 } | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3185 } |
18861 | 3186 #endif |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
3187 if(dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){ |
/* On destination lines that are not a multiple of 2^chrDstVSubSample the chroma
   destination pointer is set to NULL before calling the output function
   (presumably the callee then writes luma only; confirm in the callee). */
18861 | 3188 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; |
3189 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi | |
3190 RENAME(yuv2nv12X)(c, | |
3191 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
3192 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3193 dest, uDest, dstW, chrDstW, dstFormat); | |
3194 } | |
3195 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like | |
3196 { | |
3197 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
3198 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
3199 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12 | |
3200 { | |
// NOTE(review): this reads entry 0 of the line buffers rather than lumSrcPtr / chrSrcPtr; confirm this is intended
3201 int16_t *lumBuf = lumPixBuf[0]; | |
3202 int16_t *chrBuf= chrPixBuf[0]; | |
3203 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW); | |
3204 } | |
3205 else //General YV12 | |
3206 { | |
3207 RENAME(yuv2yuvX)(c, | |
3208 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
3209 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3210 dest, uDest, vDest, dstW, chrDstW); | |
3211 } | |
3212 } | |
3213 else | |
3214 { | |
3215 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
3216 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
3217 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB | |
3218 { | |
3219 int chrAlpha= vChrFilter[2*dstY+1]; | |
3220 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), | |
3221 dest, dstW, chrAlpha, dstFormat, flags, dstY); | |
3222 } | |
3223 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB | |
3224 { | |
3225 int lumAlpha= vLumFilter[2*dstY+1]; | |
3226 int chrAlpha= vChrFilter[2*dstY+1]; | |
19172
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3227 lumMmxFilter[2]= |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3228 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001; |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3229 chrMmxFilter[2]= |
bae6c99a99cc
vertical scaler with accurate rounding, some people on doom9 can see +-1 errors
michael
parents:
18861
diff
changeset
|
3230 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001; |
18861 | 3231 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), |
3232 dest, dstW, lumAlpha, chrAlpha, dstY); | |
3233 } | |
3234 else //General RGB | |
3235 { | |
3236 RENAME(yuv2packedX)(c, | |
3237 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
3238 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3239 dest, dstW, dstY); | |
3240 } | |
3241 } | |
3242 } | |
3243 else // hmm looks like we can't use MMX here without overwriting this array's tail | |
3244 { | |
// same dispatch as above, but calling the plain C (...inC) output functions for the last two lines
3245 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
3246 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
19872
8e50cba9fe03
Remove the dependency of libswscale on img_format.h
lucabe
parents:
19594
diff
changeset
|
3247 if(dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){ |
18861 | 3248 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; |
3249 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi | |
3250 yuv2nv12XinC( | |
3251 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
3252 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3253 dest, uDest, dstW, chrDstW, dstFormat); | |
3254 } | |
3255 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 | |
3256 { | |
3257 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
3258 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
3259 yuv2yuvXinC( | |
3260 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
3261 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3262 dest, uDest, vDest, dstW, chrDstW); | |
3263 } | |
3264 else | |
3265 { | |
3266 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
3267 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
3268 yuv2packedXinC(c, | |
3269 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
3270 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
3271 dest, dstW, dstY); | |
3272 } | |
3273 } | |
3274 } | |
3275 | |
3276 #ifdef HAVE_MMX | |
3277 __asm __volatile(SFENCE:::"memory"); | |
3278 __asm __volatile(EMMS:::"memory"); | |
3279 #endif | |
3280 /* store changed local vars back in the context */ | |
3281 c->dstY= dstY; | |
3282 c->lumBufIndex= lumBufIndex; | |
3283 c->chrBufIndex= chrBufIndex; | |
3284 c->lastInLumBuf= lastInLumBuf; | |
3285 c->lastInChrBuf= lastInChrBuf; | |
3286 | |
// number of destination lines handled by this call
3287 return dstY - lastDstY; | |
3288 } |