postproc/swscale_template.c @ 18326:36ceb960ef47  (mplayer.hg)

changeset: small spelling/grammar fixes
author:    diego
date:      Fri, 28 Apr 2006 01:52:45 +0000
parents:   7b408d60de9e
children:  b10d4b3cb9ec
/*
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "asmalign.h"

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif
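
/* Usage sketch (illustrative only): EMMS/PREFETCH/PREFETCHW/SFENCE and MOVNTQ
   below expand to instruction strings that are string-pasted into the inline
   asm blocks of this template, roughly like
       asm volatile(PREFETCH" 64(%0) \n\t" :: "r" (src));
   with EMMS (or femms) issued before returning to ordinary FPU code. Where the
   CPU lacks the extension they expand to a placeholder string instead. */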

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
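
/* MOVNTQ is a non-temporal (cache-bypassing) store where MMX2 is available and a
   plain movq otherwise; the REAL_*()/wrapper pairs exist so that macro arguments
   are fully expanded before being stringized into the asm text. */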

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

#define YSCALEYUV2YV12X(x, offset) \
"xor %%"REG_a", %%"REG_a" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
ASMALIGN16 /* FIXME Unroll? */\
"1: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
" jnz 1b \n\t"\
"psraw $3, %%mm3 \n\t"\
"psraw $3, %%mm4 \n\t"\
"packuswb %%mm4, %%mm3 \n\t"\
MOVNTQ(%%mm3, (%1, %%REGa))\
"add $8, %%"REG_a" \n\t"\
"cmp %2, %%"REG_a" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"

#define YSCALEYUV2YV121 \
"mov %2, %%"REG_a" \n\t"\
ASMALIGN16 /* FIXME Unroll? */\
"1: \n\t"\
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
"movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
"psraw $7, %%mm0 \n\t"\
"psraw $7, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
MOVNTQ(%%mm0, (%1, %%REGa))\
"add $8, %%"REG_a" \n\t"\
"jnc 1b \n\t"

/*
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
"r" (dest), "m" (dstW),
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2PACKEDX \
"xor %%"REG_a", %%"REG_a" \n\t"\
ASMALIGN16\
"nop \n\t"\
"1: \n\t"\
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
ASMALIGN16\
"2: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
"movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
" jnz 2b \n\t"\
\
"lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
"movq %%mm1, %%mm7 \n\t"\
ASMALIGN16\
"2: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm1 \n\t"\
"paddw %%mm5, %%mm7 \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
" jnz 2b \n\t"\


#define YSCALEYUV2RGBX \
YSCALEYUV2PACKEDX\
"psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
"pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
"pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
"psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
"pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#if 0
#define FULL_YSCALEYUV2RGB \
"pxor %%mm7, %%mm7 \n\t"\
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
"punpcklwd %%mm6, %%mm6 \n\t"\
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
ASMALIGN16\
"1: \n\t"\
"movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
\
\
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
"psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
\
\
"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
"paddw %%mm1, %%mm3 \n\t" /* B*/\
"paddw %%mm1, %%mm0 \n\t" /* R*/\
"packuswb %%mm3, %%mm3 \n\t"\
\
"packuswb %%mm0, %%mm0 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm2, %%mm1 \n\t" /* G*/\
\
"packuswb %%mm1, %%mm1 \n\t"
#endif

#define REAL_YSCALEYUV2PACKED(index, c) \
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
"psraw $3, %%mm0 \n\t"\
"psraw $3, %%mm1 \n\t"\
"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
"xor "#index", "#index" \n\t"\
ASMALIGN16\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
"movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

#define REAL_YSCALEYUV2RGB(index, c) \
"xor "#index", "#index" \n\t"\
ASMALIGN16\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
"movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
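
/* YSCALEYUV2RGB: two-line vertical path. U/V and Y are blended between
   buf0/buf1 (resp. uvbuf0/uvbuf1) with the per-line alpha stored in the
   context `c`, then converted to RGB with the same coefficient layout as
   YSCALEYUV2RGBX; the result again ends up as packed B/G/R bytes in
   %%mm2/%%mm4/%%mm5 with %%mm7 zeroed. */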

#define REAL_YSCALEYUV2PACKED1(index, c) \
"xor "#index", "#index" \n\t"\
ASMALIGN16\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $7, %%mm3 \n\t" \
"psraw $7, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
"xor "#index", "#index" \n\t"\
ASMALIGN16\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
"xor "#index", "#index" \n\t"\
ASMALIGN16\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $8, %%mm3 \n\t" \
"psrlw $8, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
"xor "#index", "#index" \n\t"\
ASMALIGN16\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
"psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_WRITEBGR32(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
MOVNTQ(%%mm0, (dst, index, 4))\
MOVNTQ(%%mm2, 8(dst, index, 4))\
MOVNTQ(%%mm1, 16(dst, index, 4))\
MOVNTQ(%%mm3, 24(dst, index, 4))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
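
/* WRITEBGR32: interleaves the packed B/G/R bytes (plus a zero pad byte) from
   %%mm2/%%mm4/%%mm5 into 4-byte pixels and stores 8 pixels per iteration with
   MOVNTQ at (dst, index, 4), looping back to label 1 until index reaches dstw. */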

#define REAL_WRITEBGR16(dst, dstw, index) \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $3, %%mm3 \n\t"\
"psllq $3, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (dst, index, 2))\
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
547 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
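/* WRITEBGR16 packs each pixel as RGB565: five blue bits in the LSBs, six
   green bits, five red bits in the MSBs of each 16-bit word. A scalar sketch
   of the same packing, illustrative only (the helper name is hypothetical): */
#if 0
static uint16_t pack_rgb565(uint8_t r, uint8_t g, uint8_t b)
{
	return (uint16_t)((b>>3) | ((g>>2)<<5) | ((r>>3)<<11));
}
#endif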
#define REAL_WRITEBGR15(dst, dstw, index) \
		"pand "MANGLE(bF8)", %%mm2	\n\t" /* B */\
		"pand "MANGLE(bF8)", %%mm4	\n\t" /* G */\
		"pand "MANGLE(bF8)", %%mm5	\n\t" /* R */\
		"psrlq $3, %%mm2		\n\t"\
		"psrlq $1, %%mm5		\n\t"\
\
		"movq %%mm2, %%mm1		\n\t"\
		"movq %%mm4, %%mm3		\n\t"\
\
		"punpcklbw %%mm7, %%mm3		\n\t"\
		"punpcklbw %%mm5, %%mm2		\n\t"\
		"punpckhbw %%mm7, %%mm4		\n\t"\
		"punpckhbw %%mm5, %%mm1		\n\t"\
\
		"psllq $2, %%mm3		\n\t"\
		"psllq $2, %%mm4		\n\t"\
\
		"por %%mm3, %%mm2		\n\t"\
		"por %%mm4, %%mm1		\n\t"\
\
		MOVNTQ(%%mm2, (dst, index, 2))\
		MOVNTQ(%%mm1, 8(dst, index, 2))\
\
		"add $8, "#index"		\n\t"\
		"cmp "#dstw", "#index"		\n\t"\
		" jb 1b				\n\t"
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)

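/* WRITEBGR15 is the RGB555 variant: five bits per channel with the top bit of
   each 16-bit pixel left unused. A scalar sketch, illustrative only: */
#if 0
static uint16_t pack_rgb555(uint8_t r, uint8_t g, uint8_t b)
{
	return (uint16_t)((b>>3) | ((g>>3)<<5) | ((r>>3)<<10));
}
#endif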
#define WRITEBGR24OLD(dst, dstw, index) \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
		"movq %%mm2, %%mm1		\n\t" /* B */\
		"movq %%mm5, %%mm6		\n\t" /* R */\
		"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
		"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
		"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
		"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
		"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
		"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
		"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
		"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
		"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
		"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
		"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
		"psrlq $8, %%mm0		\n\t" /* 00RGB0RG 0 */\
		"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
		"pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
		"por %%mm4, %%mm0		\n\t" /* 00RGBRGB 0 */\
		"movq %%mm2, %%mm4		\n\t" /* 0RGB0RGB 1 */\
		"psllq $48, %%mm2		\n\t" /* GB000000 1 */\
		"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
\
		"movq %%mm4, %%mm2		\n\t" /* 0RGB0RGB 1 */\
		"psrld $16, %%mm4		\n\t" /* 000R000R 1 */\
		"psrlq $24, %%mm2		\n\t" /* 0000RGB0 1.5 */\
		"por %%mm4, %%mm2		\n\t" /* 000RRGBR 1 */\
		"pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
		"movq %%mm1, %%mm4		\n\t" /* 0RGB0RGB 2 */\
		"psrlq $8, %%mm1		\n\t" /* 00RGB0RG 2 */\
		"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
		"pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
		"por %%mm4, %%mm1		\n\t" /* 00RGBRGB 2 */\
		"movq %%mm1, %%mm4		\n\t" /* 00RGBRGB 2 */\
		"psllq $32, %%mm1		\n\t" /* BRGB0000 2 */\
		"por %%mm1, %%mm2		\n\t" /* BRGBRGBR 1 */\
\
		"psrlq $32, %%mm4		\n\t" /* 000000RG 2.5 */\
		"movq %%mm3, %%mm5		\n\t" /* 0RGB0RGB 3 */\
		"psrlq $8, %%mm3		\n\t" /* 00RGB0RG 3 */\
		"pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
		"pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
		"por %%mm5, %%mm3		\n\t" /* 00RGBRGB 3 */\
		"psllq $16, %%mm3		\n\t" /* RGBRGB00 3 */\
		"por %%mm4, %%mm3		\n\t" /* RGBRGBRG 2.5 */\
\
		MOVNTQ(%%mm0, (dst))\
		MOVNTQ(%%mm2, 8(dst))\
		MOVNTQ(%%mm3, 16(dst))\
		"add $24, "#dst"		\n\t"\
\
		"add $8, "#index"		\n\t"\
		"cmp "#dstw", "#index"		\n\t"\
		" jb 1b				\n\t"

#define WRITEBGR24MMX(dst, dstw, index) \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
		"movq %%mm2, %%mm1		\n\t" /* B */\
		"movq %%mm5, %%mm6		\n\t" /* R */\
		"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
		"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
		"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
		"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
		"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
		"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
		"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
		"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
		"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
		"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
		"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
		"movq %%mm2, %%mm6		\n\t" /* 0RGB0RGB 1 */\
		"movq %%mm1, %%mm5		\n\t" /* 0RGB0RGB 2 */\
		"movq %%mm3, %%mm7		\n\t" /* 0RGB0RGB 3 */\
\
		"psllq $40, %%mm0		\n\t" /* RGB00000 0 */\
		"psllq $40, %%mm2		\n\t" /* RGB00000 1 */\
		"psllq $40, %%mm1		\n\t" /* RGB00000 2 */\
		"psllq $40, %%mm3		\n\t" /* RGB00000 3 */\
\
		"punpckhdq %%mm4, %%mm0		\n\t" /* 0RGBRGB0 0 */\
		"punpckhdq %%mm6, %%mm2		\n\t" /* 0RGBRGB0 1 */\
		"punpckhdq %%mm5, %%mm1		\n\t" /* 0RGBRGB0 2 */\
		"punpckhdq %%mm7, %%mm3		\n\t" /* 0RGBRGB0 3 */\
\
		"psrlq $8, %%mm0		\n\t" /* 00RGBRGB 0 */\
		"movq %%mm2, %%mm6		\n\t" /* 0RGBRGB0 1 */\
		"psllq $40, %%mm2		\n\t" /* GB000000 1 */\
		"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
		MOVNTQ(%%mm0, (dst))\
\
		"psrlq $24, %%mm6		\n\t" /* 0000RGBR 1 */\
		"movq %%mm1, %%mm5		\n\t" /* 0RGBRGB0 2 */\
		"psllq $24, %%mm1		\n\t" /* BRGB0000 2 */\
		"por %%mm1, %%mm6		\n\t" /* BRGBRGBR 1 */\
		MOVNTQ(%%mm6, 8(dst))\
\
		"psrlq $40, %%mm5		\n\t" /* 000000RG 2 */\
		"psllq $8, %%mm3		\n\t" /* RGBRGB00 3 */\
		"por %%mm3, %%mm5		\n\t" /* RGBRGBRG 2 */\
		MOVNTQ(%%mm5, 16(dst))\
\
		"add $24, "#dst"		\n\t"\
\
		"add $8, "#index"		\n\t"\
		"cmp "#dstw", "#index"		\n\t"\
		" jb 1b				\n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
		"movq "MANGLE(M24A)", %%mm0	\n\t"\
		"movq "MANGLE(M24C)", %%mm7	\n\t"\
		"pshufw $0x50, %%mm2, %%mm1	\n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
		"pshufw $0x50, %%mm4, %%mm3	\n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
		"pshufw $0x00, %%mm5, %%mm6	\n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
		"pand %%mm0, %%mm1		\n\t" /* B2 B1 B0 */\
		"pand %%mm0, %%mm3		\n\t" /* G2 G1 G0 */\
		"pand %%mm7, %%mm6		\n\t" /* R1 R0 */\
\
		"psllq $8, %%mm3		\n\t" /* G2 G1 G0 */\
		"por %%mm1, %%mm6		\n\t"\
		"por %%mm3, %%mm6		\n\t"\
		MOVNTQ(%%mm6, (dst))\
\
		"psrlq $8, %%mm4		\n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
		"pshufw $0xA5, %%mm2, %%mm1	\n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
		"pshufw $0x55, %%mm4, %%mm3	\n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
		"pshufw $0xA5, %%mm5, %%mm6	\n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
		"pand "MANGLE(M24B)", %%mm1	\n\t" /* B5 B4 B3 */\
		"pand %%mm7, %%mm3		\n\t" /* G4 G3 */\
		"pand %%mm0, %%mm6		\n\t" /* R4 R3 R2 */\
\
		"por %%mm1, %%mm3		\n\t" /* B5 G4 B4 G3 B3 */\
		"por %%mm3, %%mm6		\n\t"\
		MOVNTQ(%%mm6, 8(dst))\
\
		"pshufw $0xFF, %%mm2, %%mm1	\n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
		"pshufw $0xFA, %%mm4, %%mm3	\n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
		"pshufw $0xFA, %%mm5, %%mm6	\n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
		"pand %%mm7, %%mm1		\n\t" /* B7 B6 */\
		"pand %%mm0, %%mm3		\n\t" /* G7 G6 G5 */\
		"pand "MANGLE(M24B)", %%mm6	\n\t" /* R7 R6 R5 */\
\
		"por %%mm1, %%mm3		\n\t"\
		"por %%mm3, %%mm6		\n\t"\
		MOVNTQ(%%mm6, 16(dst))\
\
		"add $24, "#dst"		\n\t"\
\
		"add $8, "#index"		\n\t"\
		"cmp "#dstw", "#index"		\n\t"\
		" jb 1b				\n\t"

#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

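/* All three BGR24 writers above emit four packed 3-byte pixels (12 bytes) per
   group of four input pixels, i.e. 24 output bytes per 8-pixel loop pass. A
   scalar reference of that packing -- illustrative sketch only, names are
   hypothetical: */
#if 0
static void writebgr24_ref(uint8_t *dst, const uint8_t *b, const uint8_t *g,
                           const uint8_t *r, int n)
{
	int i;
	for(i=0; i<n; i++){
		dst[3*i+0]= b[i];
		dst[3*i+1]= g[i];
		dst[3*i+2]= r[i];
	}
}
#endif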
#define REAL_WRITEYUY2(dst, dstw, index) \
		"packuswb %%mm3, %%mm3		\n\t"\
		"packuswb %%mm4, %%mm4		\n\t"\
		"packuswb %%mm7, %%mm1		\n\t"\
		"punpcklbw %%mm4, %%mm3		\n\t"\
		"movq %%mm1, %%mm7		\n\t"\
		"punpcklbw %%mm3, %%mm1		\n\t"\
		"punpckhbw %%mm3, %%mm7		\n\t"\
\
		MOVNTQ(%%mm1, (dst, index, 2))\
		MOVNTQ(%%mm7, 8(dst, index, 2))\
\
		"add $8, "#index"		\n\t"\
		"cmp "#dstw", "#index"		\n\t"\
		" jb 1b				\n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)


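/* YUY2 stores two horizontal luma samples per chroma pair, interleaved as
   Y0 U Y1 V. A scalar sketch of that interleaving -- illustrative only, the
   function name and planar source pointers are hypothetical: */
#if 0
static void writeyuy2_ref(uint8_t *dst, const uint8_t *y, const uint8_t *u,
                          const uint8_t *v, int n /* luma samples, even */)
{
	int i;
	for(i=0; i<n; i+=2){
		dst[2*i+0]= y[i];
		dst[2*i+1]= u[i>>1];
		dst[2*i+2]= y[i+1];
		dst[2*i+3]= v[i>>1];
	}
}
#endif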
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
	if(uDest != NULL)
	{
		asm volatile(
				YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
				:: "r" (&c->redDither),
				"r" (uDest), "p" (chrDstW)
				: "%"REG_a, "%"REG_d, "%"REG_S
			);

		asm volatile(
				YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
				:: "r" (&c->redDither),
				"r" (vDest), "p" (chrDstW)
				: "%"REG_a, "%"REG_d, "%"REG_S
			);
	}

	asm volatile(
			YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
			:: "r" (&c->redDither),
			"r" (dest), "p" (dstW)
			: "%"REG_a, "%"REG_d, "%"REG_S
		);
#else
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
		      chrFilter, chrSrc, chrFilterSize,
		      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
	    chrFilter, chrSrc, chrFilterSize,
	    dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}

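/* Conceptually, the vertical scaler above forms each output luma sample as a
   fixed-point weighted sum over lumFilterSize source lines; chroma is handled
   the same way at half width. A scalar sketch of that idea -- illustrative
   only, not a copy of yuv2yuvXinC; the 19-bit descale and rounding term are
   assumptions based on the >>7 unfiltered path and 12-bit filter taps: */
#if 0
static void yuv2yuvX_sketch(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                            uint8_t *dest, int dstW)
{
	int i, j;
	for(i=0; i<dstW; i++){
		int val= 1<<18;			/* rounding term (assumed) */
		for(j=0; j<lumFilterSize; j++)
			val += lumSrc[j][i]*lumFilter[j];
		val>>=19;
		if(val<0) val=0;
		else if(val>255) val=255;
		dest[i]= val;
	}
}
#endif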
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
				     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
				     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
	yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
		     chrFilter, chrSrc, chrFilterSize,
		     dest, uDest, dstW, chrDstW, dstFormat);
}

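/* NV12-style formats keep a full-resolution luma plane but store both chroma
   components interleaved in a single half-resolution plane. A sketch of that
   interleaving step -- illustrative only, not the body of yuv2nv12XinC: */
#if 0
static void interleave_uv_sketch(uint8_t *uvDest, const uint8_t *uSrc,
                                 const uint8_t *vSrc, int chrDstW)
{
	int i;
	for(i=0; i<chrDstW; i++){
		uvDest[2*i+0]= uSrc[i];	/* U and V swap for the NV21 ordering */
		uvDest[2*i+1]= vSrc[i];
	}
}
#endif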
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
	if(uDest != NULL)
	{
		asm volatile(
				YSCALEYUV2YV121
				:: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
				"g" (-chrDstW)
				: "%"REG_a
			);

		asm volatile(
				YSCALEYUV2YV121
				:: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
				"g" (-chrDstW)
				: "%"REG_a
			);
	}

	asm volatile(
		YSCALEYUV2YV121
		:: "r" (lumSrc + dstW), "r" (dest + dstW),
		"g" (-dstW)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<dstW; i++)
	{
		int val= lumSrc[i]>>7;

		if(val&256){
			if(val<0) val=0;
			else val=255;
		}

		dest[i]= val;
	}

	if(uDest != NULL)
		for(i=0; i<chrDstW; i++)
		{
			int u=chrSrc[i]>>7;
			int v=chrSrc[i + 2048]>>7;

			if((u|v)&256){
				if(u<0) u=0;
				else if (u>255) u=255;
				if(v<0) v=0;
				else if (v>255) v=255;
			}

			uDest[i]= u;
			vDest[i]= v;
		}
#endif
}


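/* The "val&256" test in the C fallback above is a cheap range check: after the
   >>7 descale the result normally fits in 0..255, and for the small overshoots
   this data can produce, bit 8 being set flags values just above 255 or just
   below 0, so the slower signed clamp only runs in that rare case. A sketch of
   the idea, illustrative only: */
#if 0
static int clip_uint8_sketch(int val)
{
	if(val&256){		/* outside 0..255, rare case */
		if(val<0) val=0;
		else      val=255;
	}
	return val;
}
#endif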
/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
			    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
			    uint8_t *dest, int dstW, int dstY)
{
	int dummy=0;
	switch(c->dstFormat)
	{
#ifdef HAVE_MMX
	case IMGFMT_BGR32:
		{
			asm volatile(
				YSCALEYUV2RGBX
				WRITEBGR32(%4, %5, %%REGa)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%"REG_a, "%"REG_d, "%"REG_S
			);
		}
		break;
	case IMGFMT_BGR24:
		{
			asm volatile(
				YSCALEYUV2RGBX
				"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
				"add %4, %%"REG_b"			\n\t"
				WRITEBGR24(%%REGb, %5, %%REGa)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
			);
		}
		break;
	case IMGFMT_BGR15:
		{
			asm volatile(
				YSCALEYUV2RGBX
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR15(%4, %5, %%REGa)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%"REG_a, "%"REG_d, "%"REG_S
			);
		}
		break;
	case IMGFMT_BGR16:
		{
			asm volatile(
				YSCALEYUV2RGBX
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16(%4, %5, %%REGa)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%"REG_a, "%"REG_d, "%"REG_S
			);
		}
		break;
	case IMGFMT_YUY2:
		{
			asm volatile(
				YSCALEYUV2PACKEDX
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

				"psraw $3, %%mm3		\n\t"
				"psraw $3, %%mm4		\n\t"
				"psraw $3, %%mm1		\n\t"
				"psraw $3, %%mm7		\n\t"
				WRITEYUY2(%4, %5, %%REGa)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%"REG_a, "%"REG_d, "%"REG_S
			);
		}
		break;
#endif
	default:
#ifdef HAVE_ALTIVEC
		/* The following list of supported dstFormat values should
		   match what's found in the body of altivec_yuv2packedX() */
		if(c->dstFormat==IMGFMT_ABGR  || c->dstFormat==IMGFMT_BGRA  ||
		   c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 ||
		   c->dstFormat==IMGFMT_RGBA  || c->dstFormat==IMGFMT_ARGB)
			altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
					chrFilter, chrSrc, chrFilterSize,
					dest, dstW, dstY);
		else
#endif
			yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
					chrFilter, chrSrc, chrFilterSize,
					dest, dstW, dstY);
		break;
	}
}

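/* Each case above converts the filtered Y/U/V samples to RGB and hands the
   byte packing to one of the WRITE* macros. As a rough scalar picture of the
   colour-conversion step itself -- a generic integer BT.601 sketch with
   assumed coefficients, not the table-driven code used elsewhere in this
   file: */
#if 0
static void yuv2rgb_pixel_sketch(int Y, int U, int V, uint8_t rgb[3])
{
	int c= (Y-16)*298, d= U-128, e= V-128;
	int r= (c           + 409*e + 128)>>8;
	int g= (c - 100*d   - 208*e + 128)>>8;
	int b= (c + 516*d           + 128)>>8;
	rgb[0]= r<0 ? 0 : r>255 ? 255 : r;
	rgb[1]= g<0 ? 0 : g>255 ? 255 : g;
	rgb[2]= b<0 ? 0 : b>255 ? 255 : b;
}
#endif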
/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
			    uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
	int yalpha1=yalpha^4095;
	int uvalpha1=uvalpha^4095;
	int i;

4467 | 1000 if(flags&SWS_FULL_CHR_H_INT) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1001 { |
6578 | 1002 switch(dstFormat) |
1003 { | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1004 #ifdef HAVE_MMX |
6578 | 1005 case IMGFMT_BGR32: |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1006 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1007 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1008 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1009 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1010 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1011 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1012 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1013 "movq %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1014 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1015 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1016 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1017 MOVNTQ(%%mm3, (%4, %%REGa, 4)) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1018 MOVNTQ(%%mm1, 8(%4, %%REGa, 4)) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1019 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1020 "add $4, %%"REG_a" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1021 "cmp %5, %%"REG_a" \n\t" |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1022 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1023 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1024 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1025 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1026 "m" (yalpha1), "m" (uvalpha1) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1027 : "%"REG_a |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1028 ); |
			break;
		case IMGFMT_BGR24:
			asm volatile(

				FULL_YSCALEYUV2RGB

				// lsb ... msb
				"punpcklbw %%mm1, %%mm3		\n\t" // BGBGBGBG
				"punpcklbw %%mm7, %%mm0		\n\t" // R0R0R0R0

				"movq %%mm3, %%mm1		\n\t"
				"punpcklwd %%mm0, %%mm3		\n\t" // BGR0BGR0
				"punpckhwd %%mm0, %%mm1		\n\t" // BGR0BGR0

				"movq %%mm3, %%mm2		\n\t" // BGR0BGR0
				"psrlq $8, %%mm3		\n\t" // GR0BGR00
				"pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
				"pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
				"por %%mm2, %%mm3		\n\t" // BGRBGR00
				"movq %%mm1, %%mm2		\n\t"
				"psllq $48, %%mm1		\n\t" // 000000BG
				"por %%mm1, %%mm3		\n\t" // BGRBGRBG

				"movq %%mm2, %%mm1		\n\t" // BGR0BGR0
				"psrld $16, %%mm2		\n\t" // R000R000
				"psrlq $24, %%mm1		\n\t" // 0BGR0000
				"por %%mm2, %%mm1		\n\t" // RBGRR000

				"mov %4, %%"REG_b"		\n\t"
				"add %%"REG_a", %%"REG_b"	\n\t"

#ifdef HAVE_MMX2
				//FIXME Alignment
				"movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
				"movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
#else
				"movd %%mm3, (%%"REG_b", %%"REG_a", 2)	\n\t"
				"psrlq $32, %%mm3		\n\t"
				"movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)	\n\t"
				"movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)	\n\t"
#endif
				"add $4, %%"REG_a"		\n\t"
				"cmp %5, %%"REG_a"		\n\t"
				" jb 1b				\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%"REG_a, "%"REG_b
			);
			break;
		case IMGFMT_BGR15:
			asm volatile(

				FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
				"paddusb "MANGLE(g5Dither)", %%mm1\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
				"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
				"punpcklbw %%mm7, %%mm1		\n\t" // 0G0G0G0G
				"punpcklbw %%mm7, %%mm3		\n\t" // 0B0B0B0B
				"punpcklbw %%mm7, %%mm0		\n\t" // 0R0R0R0R

				"psrlw $3, %%mm3		\n\t"
				"psllw $2, %%mm1		\n\t"
				"psllw $7, %%mm0		\n\t"
				"pand "MANGLE(g15Mask)", %%mm1	\n\t"
				"pand "MANGLE(r15Mask)", %%mm0	\n\t"

				"por %%mm3, %%mm1		\n\t"
				"por %%mm1, %%mm0		\n\t"

				MOVNTQ(%%mm0, (%4, %%REGa, 2))

				"add $4, %%"REG_a"		\n\t"
				"cmp %5, %%"REG_a"		\n\t"
				" jb 1b				\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%"REG_a
			);
			break;
		case IMGFMT_BGR16:
			asm volatile(

				FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
				"paddusb "MANGLE(g6Dither)", %%mm1\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
				"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
				"punpcklbw %%mm7, %%mm1		\n\t" // 0G0G0G0G
				"punpcklbw %%mm7, %%mm3		\n\t" // 0B0B0B0B
				"punpcklbw %%mm7, %%mm0		\n\t" // 0R0R0R0R

				"psrlw $3, %%mm3		\n\t"
				"psllw $3, %%mm1		\n\t"
				"psllw $8, %%mm0		\n\t"
				"pand "MANGLE(g16Mask)", %%mm1	\n\t"
				"pand "MANGLE(r16Mask)", %%mm0	\n\t"

				"por %%mm3, %%mm1		\n\t"
				"por %%mm1, %%mm0		\n\t"

				MOVNTQ(%%mm0, (%4, %%REGa, 2))

				"add $4, %%"REG_a"		\n\t"
				"cmp %5, %%"REG_a"		\n\t"
				" jb 1b				\n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%"REG_a
			);
			break;
#endif
		case IMGFMT_RGB32:
#ifndef HAVE_MMX
		case IMGFMT_BGR32:
#endif
		if(dstFormat==IMGFMT_BGR32)
		{
			int i;
#ifdef WORDS_BIGENDIAN
			dest++;
#endif
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
				dest+= 4;
			}
		}
		else if(dstFormat==IMGFMT_BGR24)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
				dest+= 3;
			}
		}
		else if(dstFormat==IMGFMT_BGR16)
		{
2671 | 1183 int i; |
3209 | 1184 for(i=0;i<dstW;i++){ |
1185 // vertical linear interpolation && yuv2rgb in a single step: |
1186 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
1187 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
1188 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
1189 |
1190 ((uint16_t*)dest)[i] = |
2584 | 1191 clip_table16b[(Y + yuvtab_40cf[U]) >>13] | |
1192 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1193 clip_table16r[(Y + yuvtab_3343[V]) >>13]; | |
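// note: each clip_table16b/g/r entry apparently holds the clipped channel value already shifted into its 5/6/5 bit field, so ORing the three lookups assembles the packed 16 bit pixel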
1194 } |
1195 } |
1196 else if(dstFormat==IMGFMT_BGR15) |
1197 { |
2671 | 1198 int i; |
3209 | 1199 for(i=0;i<dstW;i++){ |
1200 // vertical linear interpolation && yuv2rgb in a single step: |
1201 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
1202 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
1203 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
1204 |
1205 ((uint16_t*)dest)[i] = |
2584 | 1206 clip_table15b[(Y + yuvtab_40cf[U]) >>13] | |
1207 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1208 clip_table15r[(Y + yuvtab_3343[V]) >>13]; | |
1209 } |
1210 } |
1211 }//FULL_UV_IPOL |
1212 else |
1213 { |
6578 | 1214 #endif // if 0 |
1215 #ifdef HAVE_MMX |
6578 | 1216 switch(c->dstFormat) |
1217 { | |
11000 | 1218 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( |
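// 8280(%5) is presumably the offset of the dstW field relative to &c->redDither (operand %5); the literal is used because DSTW_OFFSET cannot be expanded inside these macro arguments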
6578 | 1219 case IMGFMT_BGR32: |
1220 asm volatile( |
1221 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1222 "mov %4, %%"REG_SP" \n\t" |
1223 YSCALEYUV2RGB(%%REGa, %5) |
1224 WRITEBGR32(%%REGSP, 8280(%5), %%REGa) |
1225 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
1226 |
9414 | 1227 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), |
1228 "r" (&c->redDither) | |
1229 : "%"REG_a |
1230 ); |
6578 | 1231 return; |
1232 case IMGFMT_BGR24: | |
1233 asm volatile( |
1234 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1235 "mov %4, %%"REG_SP" \n\t" |
1236 YSCALEYUV2RGB(%%REGa, %5) |
1237 WRITEBGR24(%%REGSP, 8280(%5), %%REGa) |
1238 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9414 | 1239 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), |
1240 "r" (&c->redDither) | |
1241 : "%"REG_a |
1242 ); |
6578 | 1243 return; |
1244 case IMGFMT_BGR15: | |
1245 asm volatile( |
1246 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1247 "mov %4, %%"REG_SP" \n\t" |
1248 YSCALEYUV2RGB(%%REGa, %5) |
1249 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1250 #ifdef DITHER1XBPP |
4248 | 1251 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1252 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1253 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1254 #endif |
1255 |
1256 WRITEBGR15(%%REGSP, 8280(%5), %%REGa) |
1257 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
1258 |
9414 | 1259 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), |
1260 "r" (&c->redDither) | |
1261 : "%"REG_a |
1262 ); |
6578 | 1263 return; |
1264 case IMGFMT_BGR16: | |
1265 asm volatile( |
1266 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1267 "mov %4, %%"REG_SP" \n\t" |
1268 YSCALEYUV2RGB(%%REGa, %5) |
1269 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1270 #ifdef DITHER1XBPP |
4248 | 1271 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1272 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1273 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1274 #endif |
1275 |
1276 WRITEBGR16(%%REGSP, 8280(%5), %%REGa) |
1277 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9414 | 1278 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), |
1279 "r" (&c->redDither) | |
1280 : "%"REG_a |
1281 ); |
6578 | 1282 return; |
7723 | 1283 case IMGFMT_YUY2: |
1284 asm volatile( | |
1285 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1286 "mov %4, %%"REG_SP" \n\t" |
1287 YSCALEYUV2PACKED(%%REGa, %5) |
1288 WRITEYUY2(%%REGSP, 8280(%5), %%REGa) |
1289 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9414 | 1290 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), |
1291 "r" (&c->redDither) | |
1292 : "%"REG_a |
7723 | 1293 ); |
1294 return; | |
6578 | 1295 default: break; |
1296 } | |
1297 #endif //HAVE_MMX |
7723 | 1298 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C) |
1299 } |
1300 |
1301 /** |
1302 * YV12 to RGB without scaling or interpolating |
1303 */ |
7723 | 1304 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, |
6578 | 1305 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y) |
1306 { |
3344 | 1307 const int yalpha1=0; |
6578 | 1308 int i; |
1309 | |
1310 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1 | |
1311 const int yalpha= 4096; //FIXME ... | |
2671 | 1312 |
4467 | 1313 if(flags&SWS_FULL_CHR_H_INT) |
1314 { |
7723 | 1315 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y); |
1316 return; |
1317 } |
2576 | 1318 |
1319 #ifdef HAVE_MMX |
1320 if( uvalpha < 2048 ) // note this is not correct (it shifts chrominance by 0.5 pixels) but it is a bit faster |
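// uvalpha is the blend weight (0..4096) between the two stored chroma lines; below 2048 the single-line variants are used, otherwise the *1b variants below average the two lines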
1321 { |
6578 | 1322 switch(dstFormat) |
1323 { |
6578 | 1324 case IMGFMT_BGR32: |
1325 asm volatile( |
1326 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1327 "mov %4, %%"REG_SP" \n\t" |
1328 YSCALEYUV2RGB1(%%REGa, %5) |
1329 WRITEBGR32(%%REGSP, 8280(%5), %%REGa) |
1330 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9417 | 1331 |
1332 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1333 "r" (&c->redDither) | |
1334 : "%"REG_a |
1335 ); |
6578 | 1336 return; |
1337 case IMGFMT_BGR24: | |
1338 asm volatile( |
1339 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1340 "mov %4, %%"REG_SP" \n\t" |
1341 YSCALEYUV2RGB1(%%REGa, %5) |
1342 WRITEBGR24(%%REGSP, 8280(%5), %%REGa) |
1343 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9417 | 1344 |
1345 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1346 "r" (&c->redDither) | |
1347 : "%"REG_a |
1348 ); |
6578 | 1349 return; |
1350 case IMGFMT_BGR15: | |
1351 asm volatile( |
1352 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1353 "mov %4, %%"REG_SP" \n\t" |
1354 YSCALEYUV2RGB1(%%REGa, %5) |
1355 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1356 #ifdef DITHER1XBPP |
4248 | 1357 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1358 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1359 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1360 #endif |
1361 WRITEBGR15(%%REGSP, 8280(%5), %%REGa) |
1362 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9417 | 1363 |
1364 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1365 "r" (&c->redDither) | |
1366 : "%"REG_a |
1367 ); |
6578 | 1368 return; |
1369 case IMGFMT_BGR16: | |
1370 asm volatile( |
1371 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1372 "mov %4, %%"REG_SP" \n\t" |
1373 YSCALEYUV2RGB1(%%REGa, %5) |
1374 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1375 #ifdef DITHER1XBPP |
4248 | 1376 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1377 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1378 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1379 #endif |
1380 |
1381 WRITEBGR16(%%REGSP, 8280(%5), %%REGa) |
1382 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9417 | 1383 |
1384 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1385 "r" (&c->redDither) | |
1386 : "%"REG_a |
1387 ); |
6578 | 1388 return; |
7723 | 1389 case IMGFMT_YUY2: |
1390 asm volatile( | |
1391 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1392 "mov %4, %%"REG_SP" \n\t" |
1393 YSCALEYUV2PACKED1(%%REGa, %5) |
1394 WRITEYUY2(%%REGSP, 8280(%5), %%REGa) |
1395 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9417 | 1396 |
1397 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1398 "r" (&c->redDither) | |
1399 : "%"REG_a |
7723 | 1400 ); |
1401 return; | |
1402 } |
1403 } |
1404 else |
1405 { |
6578 | 1406 switch(dstFormat) |
1407 { |
6578 | 1408 case IMGFMT_BGR32: |
1409 asm volatile( |
1410 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1411 "mov %4, %%"REG_SP" \n\t" |
1412 YSCALEYUV2RGB1b(%%REGa, %5) |
1413 WRITEBGR32(%%REGSP, 8280(%5), %%REGa) |
1414 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9417 | 1415 |
1416 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1417 "r" (&c->redDither) | |
1418 : "%"REG_a |
1419 ); |
6578 | 1420 return; |
1421 case IMGFMT_BGR24: | |
1422 asm volatile( |
1423 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1424 "mov %4, %%"REG_SP" \n\t" |
1425 YSCALEYUV2RGB1b(%%REGa, %5) |
1426 WRITEBGR24(%%REGSP, 8280(%5), %%REGa) |
1427 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9417 | 1428 |
1429 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1430 "r" (&c->redDither) | |
1431 : "%"REG_a |
1432 ); |
6578 | 1433 return; |
1434 case IMGFMT_BGR15: | |
1435 asm volatile( |
1436 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1437 "mov %4, %%"REG_SP" \n\t" |
1438 YSCALEYUV2RGB1b(%%REGa, %5) |
1439 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1440 #ifdef DITHER1XBPP |
4248 | 1441 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1442 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1443 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1444 #endif |
1445 WRITEBGR15(%%REGSP, 8280(%5), %%REGa) |
1446 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9417 | 1447 |
1448 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1449 "r" (&c->redDither) | |
1450 : "%"REG_a |
1451 ); |
6578 | 1452 return; |
1453 case IMGFMT_BGR16: | |
1454 asm volatile( |
1455 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1456 "mov %4, %%"REG_SP" \n\t" |
1457 YSCALEYUV2RGB1b(%%REGa, %5) |
1458 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1459 #ifdef DITHER1XBPP |
4248 | 1460 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1461 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1462 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
1463 #endif |
1464 |
1465 WRITEBGR16(%%REGSP, 8280(%5), %%REGa) |
1466 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9417 | 1467 |
1468 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1469 "r" (&c->redDither) | |
1470 : "%"REG_a |
1471 ); |
6578 | 1472 return; |
7723 | 1473 case IMGFMT_YUY2: |
1474 asm volatile( | |
1475 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
1476 "mov %4, %%"REG_SP" \n\t" |
1477 YSCALEYUV2PACKED1b(%%REGa, %5) |
1478 WRITEYUY2(%%REGSP, 8280(%5), %%REGa) |
1479 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9417 | 1480 |
1481 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), | |
1482 "r" (&c->redDither) | |
1483 : "%"REG_a |
7723 | 1484 ); |
1485 return; | |
1486 } |
1487 } |
1488 #endif |
6578 | 1489 if( uvalpha < 2048 ) |
1490 { | |
7723 | 1491 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C) |
6578 | 1492 }else{ |
7723 | 1493 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C) |
6578 | 1494 } |
1495 } |
1496 |
4481 | 1497 //FIXME yuy2* can read up to 7 samples too many |
1498 | |
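// YUY2 stores pixels as Y0 U0 Y1 V0, so luma lives in the even bytes and chroma in the odd ones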
1499 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width) |
4467 | 1500 { |
4481 | 1501 #ifdef HAVE_MMX |
1502 asm volatile( | |
1503 "movq "MANGLE(bm01010101)", %%mm2\n\t" | |
1504 "mov %0, %%"REG_a" \n\t" |
4481 | 1505 "1: \n\t" |
1506 "movq (%1, %%"REG_a",2), %%mm0 \n\t" |
1507 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" |
4481 | 1508 "pand %%mm2, %%mm0 \n\t" |
1509 "pand %%mm2, %%mm1 \n\t" | |
1510 "packuswb %%mm1, %%mm0 \n\t" | |
1511 "movq %%mm0, (%2, %%"REG_a") \n\t" |
1512 "add $8, %%"REG_a" \n\t" |
4481 | 1513 " js 1b \n\t" |
1514 : : "g" (-width), "r" (src+width*2), "r" (dst+width) |
1515 : "%"REG_a |
4481 | 1516 ); |
4467 | 1517 #else |
1518 int i; | |
1519 for(i=0; i<width; i++) | |
1520 dst[i]= src[2*i]; | |
1521 #endif | |
1522 } | |
1523 | |
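// chroma is read from two consecutive input lines and averaged, since the planar output is vertically subsampled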
1524 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
4467 | 1525 { |
4481 | 1526 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1527 asm volatile( | |
1528 "movq "MANGLE(bm01010101)", %%mm4\n\t" | |
1529 "mov %0, %%"REG_a" \n\t" |
4481 | 1530 "1: \n\t" |
1531 "movq (%1, %%"REG_a",4), %%mm0 \n\t" |
1532 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" |
1533 "movq (%2, %%"REG_a",4), %%mm2 \n\t" |
1534 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t" |
4481 | 1535 PAVGB(%%mm2, %%mm0) |
1536 PAVGB(%%mm3, %%mm1) | |
1537 "psrlw $8, %%mm0 \n\t" | |
1538 "psrlw $8, %%mm1 \n\t" | |
1539 "packuswb %%mm1, %%mm0 \n\t" | |
1540 "movq %%mm0, %%mm1 \n\t" | |
1541 "psrlw $8, %%mm0 \n\t" | |
1542 "pand %%mm4, %%mm1 \n\t" | |
1543 "packuswb %%mm0, %%mm0 \n\t" | |
1544 "packuswb %%mm1, %%mm1 \n\t" | |
1545 "movd %%mm0, (%4, %%"REG_a") \n\t" |
1546 "movd %%mm1, (%3, %%"REG_a") \n\t" |
1547 "add $4, %%"REG_a" \n\t" |
4481 | 1548 " js 1b \n\t" |
1549 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width) |
1550 : "%"REG_a |
4481 | 1551 ); |
4467 | 1552 #else |
1553 int i; | |
1554 for(i=0; i<width; i++) | |
1555 { | |
1556 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1; | |
1557 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1; | |
1558 } | |
1559 #endif | |
1560 } | |
1561 | |
9071 | 1562 //this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses |
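// UYVY stores pixels as U0 Y0 V0 Y1, so luma lives in the odd bytes and chroma in the even ones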
1563 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width) |
9071 | 1564 { |
1565 #ifdef HAVE_MMX | |
1566 asm volatile( | |
1567 "mov %0, %%"REG_a" \n\t" |
9071 | 1568 "1: \n\t" |
1569 "movq (%1, %%"REG_a",2), %%mm0 \n\t" |
1570 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" |
9071 | 1571 "psrlw $8, %%mm0 \n\t" |
1572 "psrlw $8, %%mm1 \n\t" | |
1573 "packuswb %%mm1, %%mm0 \n\t" | |
1574 "movq %%mm0, (%2, %%"REG_a") \n\t" |
1575 "add $8, %%"REG_a" \n\t" |
9071 | 1576 " js 1b \n\t" |
1577 : : "g" (-width), "r" (src+width*2), "r" (dst+width) |
1578 : "%"REG_a |
9071 | 1579 ); |
1580 #else | |
1581 int i; | |
1582 for(i=0; i<width; i++) | |
1583 dst[i]= src[2*i+1]; | |
1584 #endif | |
1585 } | |
1586 | |
1587 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
9071 | 1588 { |
1589 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1590 asm volatile( | |
1591 "movq "MANGLE(bm01010101)", %%mm4\n\t" | |
1592 "mov %0, %%"REG_a" \n\t" |
9071 | 1593 "1: \n\t" |
1594 "movq (%1, %%"REG_a",4), %%mm0 \n\t" |
1595 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" |
1596 "movq (%2, %%"REG_a",4), %%mm2 \n\t" |
1597 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t" |
9071 | 1598 PAVGB(%%mm2, %%mm0) |
1599 PAVGB(%%mm3, %%mm1) | |
1600 "pand %%mm4, %%mm0 \n\t" | |
1601 "pand %%mm4, %%mm1 \n\t" | |
1602 "packuswb %%mm1, %%mm0 \n\t" | |
1603 "movq %%mm0, %%mm1 \n\t" | |
1604 "psrlw $8, %%mm0 \n\t" | |
1605 "pand %%mm4, %%mm1 \n\t" | |
1606 "packuswb %%mm0, %%mm0 \n\t" | |
1607 "packuswb %%mm1, %%mm1 \n\t" | |
1608 "movd %%mm0, (%4, %%"REG_a") \n\t" |
1609 "movd %%mm1, (%3, %%"REG_a") \n\t" |
1610 "add $4, %%"REG_a" \n\t" |
9071 | 1611 " js 1b \n\t" |
1612 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width) |
1613 : "%"REG_a |
9071 | 1614 ); |
1615 #else | |
1616 int i; | |
1617 for(i=0; i<width; i++) | |
1618 { | |
1619 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1; | |
1620 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1; | |
1621 } | |
1622 #endif | |
1623 } | |
1624 | |
4467 | 1625 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width) |
1626 { | |
1627 int i; | |
1628 for(i=0; i<width; i++) | |
1629 { | |
9433 | 1630 int b= ((uint32_t*)src)[i]&0xFF; |
1631 int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
9499 | 1632 int r= (((uint32_t*)src)[i]>>16)&0xFF; |
4467 | 1633 |
9433 | 1634 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); |
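// 33<<(RGB2YUV_SHIFT-1) == (16<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-1)), i.e. the +16 luma offset plus a rounding term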
4467 | 1635 } |
1636 } | |
1637 | |
1638 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1639 { | |
1640 int i; | |
1641 for(i=0; i<width; i++) | |
1642 { | |
9433 | 1643 const int a= ((uint32_t*)src1)[2*i+0]; |
1644 const int e= ((uint32_t*)src1)[2*i+1]; | |
1645 const int c= ((uint32_t*)src2)[2*i+0]; | |
1646 const int d= ((uint32_t*)src2)[2*i+1]; | |
1647 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF); | |
1648 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00); | |
1649 const int b= l&0x3FF; | |
1650 const int g= h>>8; | |
1651 const int r= l>>16; | |
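// l packs the B and R sums of the 2x2 block into one int (each sum is at most 4*255 = 1020, so the fields cannot carry into each other); h holds the G sums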
4467 | 1652 |
1653 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1654 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1655 } | |
1656 } | |
1657 | |
1658 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width) |
4467 | 1659 { |
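// MMX path, roughly: each pmaddwd with bgr2YCoeff multiplies the unpacked b,g,r bytes by the luma coefficients, and the later pmaddwd with w1111 (four 1s) adds the partial sums horizontally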
4612 | 1660 #ifdef HAVE_MMX |
1661 asm volatile( | |
1662 "mov %2, %%"REG_a" \n\t" |
4923 | 1663 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t" |
1664 "movq "MANGLE(w1111)", %%mm5 \n\t" | |
4612 | 1665 "pxor %%mm7, %%mm7 \n\t" |
1666 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" |
1667 ASMALIGN16 |
4612 | 1668 "1: \n\t" |
1669 PREFETCH" 64(%0, %%"REG_b") \n\t" |
1670 "movd (%0, %%"REG_b"), %%mm0 \n\t" |
1671 "movd 3(%0, %%"REG_b"), %%mm1 \n\t" |
4612 | 1672 "punpcklbw %%mm7, %%mm0 \n\t" |
1673 "punpcklbw %%mm7, %%mm1 \n\t" | |
1674 "movd 6(%0, %%"REG_b"), %%mm2 \n\t" |
1675 "movd 9(%0, %%"REG_b"), %%mm3 \n\t" |
4612 | 1676 "punpcklbw %%mm7, %%mm2 \n\t" |
1677 "punpcklbw %%mm7, %%mm3 \n\t" | |
1678 "pmaddwd %%mm6, %%mm0 \n\t" | |
1679 "pmaddwd %%mm6, %%mm1 \n\t" | |
1680 "pmaddwd %%mm6, %%mm2 \n\t" | |
1681 "pmaddwd %%mm6, %%mm3 \n\t" | |
1682 #ifndef FAST_BGR2YV12 | |
1683 "psrad $8, %%mm0 \n\t" | |
1684 "psrad $8, %%mm1 \n\t" | |
1685 "psrad $8, %%mm2 \n\t" | |
1686 "psrad $8, %%mm3 \n\t" | |
1687 #endif | |
1688 "packssdw %%mm1, %%mm0 \n\t" | |
1689 "packssdw %%mm3, %%mm2 \n\t" | |
1690 "pmaddwd %%mm5, %%mm0 \n\t" | |
1691 "pmaddwd %%mm5, %%mm2 \n\t" | |
1692 "packssdw %%mm2, %%mm0 \n\t" | |
1693 "psraw $7, %%mm0 \n\t" | |
1694 | |
1695 "movd 12(%0, %%"REG_b"), %%mm4 \n\t" |
1696 "movd 15(%0, %%"REG_b"), %%mm1 \n\t" |
4612 | 1697 "punpcklbw %%mm7, %%mm4 \n\t" |
1698 "punpcklbw %%mm7, %%mm1 \n\t" | |
1699 "movd 18(%0, %%"REG_b"), %%mm2 \n\t" |
1700 "movd 21(%0, %%"REG_b"), %%mm3 \n\t" |
4612 | 1701 "punpcklbw %%mm7, %%mm2 \n\t" |
1702 "punpcklbw %%mm7, %%mm3 \n\t" | |
1703 "pmaddwd %%mm6, %%mm4 \n\t" | |
1704 "pmaddwd %%mm6, %%mm1 \n\t" | |
1705 "pmaddwd %%mm6, %%mm2 \n\t" | |
1706 "pmaddwd %%mm6, %%mm3 \n\t" | |
1707 #ifndef FAST_BGR2YV12 | |
1708 "psrad $8, %%mm4 \n\t" | |
1709 "psrad $8, %%mm1 \n\t" | |
1710 "psrad $8, %%mm2 \n\t" | |
1711 "psrad $8, %%mm3 \n\t" | |
1712 #endif | |
1713 "packssdw %%mm1, %%mm4 \n\t" | |
1714 "packssdw %%mm3, %%mm2 \n\t" | |
1715 "pmaddwd %%mm5, %%mm4 \n\t" | |
1716 "pmaddwd %%mm5, %%mm2 \n\t" | |
1717 "add $24, %%"REG_b" \n\t" |
4612 | 1718 "packssdw %%mm2, %%mm4 \n\t" |
1719 "psraw $7, %%mm4 \n\t" | |
1720 | |
1721 "packuswb %%mm4, %%mm0 \n\t" | |
4923 | 1722 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" |
4612 | 1723 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1724 "movq %%mm0, (%1, %%"REG_a") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1725 "add $8, %%"REG_a" \n\t" |
4612 | 1726 " js 1b \n\t" |
1727 : : "r" (src+width*3), "r" (dst+width), "g" (-width) |
1728 : "%"REG_a, "%"REG_b |
4612 | 1729 ); |
4467 | 1730 #else |
1731 int i; | |
1732 for(i=0; i<width; i++) | |
1733 { | |
1734 int b= src[i*3+0]; | |
1735 int g= src[i*3+1]; | |
1736 int r= src[i*3+2]; | |
1737 | |
9434 | 1738 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); |
4467 | 1739 } |
1740 #endif | |
1741 } | |
1742 | |
1743 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
4467 | 1744 { |
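// same structure as bgr24ToY above, but with the U/V coefficient tables and a 2x2 average of the two input lines before the dot products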
4619 | 1745 #ifdef HAVE_MMX |
1746 asm volatile( | |
1747 "mov %4, %%"REG_a" \n\t" |
4923 | 1748 "movq "MANGLE(w1111)", %%mm5 \n\t" |
1749 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | |
4619 | 1750 "pxor %%mm7, %%mm7 \n\t" |
1751 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t" |
1752 "add %%"REG_b", %%"REG_b" \n\t" |
1753 ASMALIGN16 |
4619 | 1754 "1: \n\t" |
1755 PREFETCH" 64(%0, %%"REG_b") \n\t" |
1756 PREFETCH" 64(%1, %%"REG_b") \n\t" |
4619 | 1757 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1758 "movq (%0, %%"REG_b"), %%mm0 \n\t" |
1759 "movq (%1, %%"REG_b"), %%mm1 \n\t" |
1760 "movq 6(%0, %%"REG_b"), %%mm2 \n\t" |
1761 "movq 6(%1, %%"REG_b"), %%mm3 \n\t" |
4619 | 1762 PAVGB(%%mm1, %%mm0) |
1763 PAVGB(%%mm3, %%mm2) | |
1764 "movq %%mm0, %%mm1 \n\t" | |
1765 "movq %%mm2, %%mm3 \n\t" | |
1766 "psrlq $24, %%mm0 \n\t" | |
1767 "psrlq $24, %%mm2 \n\t" | |
1768 PAVGB(%%mm1, %%mm0) | |
1769 PAVGB(%%mm3, %%mm2) | |
1770 "punpcklbw %%mm7, %%mm0 \n\t" | |
1771 "punpcklbw %%mm7, %%mm2 \n\t" | |
1772 #else | |
1773 "movd (%0, %%"REG_b"), %%mm0 \n\t" |
1774 "movd (%1, %%"REG_b"), %%mm1 \n\t" |
1775 "movd 3(%0, %%"REG_b"), %%mm2 \n\t" |
1776 "movd 3(%1, %%"REG_b"), %%mm3 \n\t" |
4619 | 1777 "punpcklbw %%mm7, %%mm0 \n\t" |
1778 "punpcklbw %%mm7, %%mm1 \n\t" | |
1779 "punpcklbw %%mm7, %%mm2 \n\t" | |
1780 "punpcklbw %%mm7, %%mm3 \n\t" | |
1781 "paddw %%mm1, %%mm0 \n\t" | |
1782 "paddw %%mm3, %%mm2 \n\t" | |
1783 "paddw %%mm2, %%mm0 \n\t" | |
1784 "movd 6(%0, %%"REG_b"), %%mm4 \n\t" |
1785 "movd 6(%1, %%"REG_b"), %%mm1 \n\t" |
1786 "movd 9(%0, %%"REG_b"), %%mm2 \n\t" |
1787 "movd 9(%1, %%"REG_b"), %%mm3 \n\t" |
4619 | 1788 "punpcklbw %%mm7, %%mm4 \n\t" |
1789 "punpcklbw %%mm7, %%mm1 \n\t" | |
1790 "punpcklbw %%mm7, %%mm2 \n\t" | |
1791 "punpcklbw %%mm7, %%mm3 \n\t" | |
1792 "paddw %%mm1, %%mm4 \n\t" | |
1793 "paddw %%mm3, %%mm2 \n\t" | |
1794 "paddw %%mm4, %%mm2 \n\t" | |
1795 "psrlw $2, %%mm0 \n\t" | |
1796 "psrlw $2, %%mm2 \n\t" | |
1797 #endif | |
4923 | 1798 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
1799 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4619 | 1800 |
1801 "pmaddwd %%mm0, %%mm1 \n\t" | |
1802 "pmaddwd %%mm2, %%mm3 \n\t" | |
1803 "pmaddwd %%mm6, %%mm0 \n\t" | |
1804 "pmaddwd %%mm6, %%mm2 \n\t" | |
1805 #ifndef FAST_BGR2YV12 | |
1806 "psrad $8, %%mm0 \n\t" | |
1807 "psrad $8, %%mm1 \n\t" | |
1808 "psrad $8, %%mm2 \n\t" | |
1809 "psrad $8, %%mm3 \n\t" | |
1810 #endif | |
1811 "packssdw %%mm2, %%mm0 \n\t" | |
1812 "packssdw %%mm3, %%mm1 \n\t" | |
1813 "pmaddwd %%mm5, %%mm0 \n\t" | |
1814 "pmaddwd %%mm5, %%mm1 \n\t" | |
1815 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
1816 "psraw $7, %%mm0 \n\t" | |
1817 | |
1818 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1819 "movq 12(%0, %%"REG_b"), %%mm4 \n\t" |
1820 "movq 12(%1, %%"REG_b"), %%mm1 \n\t" |
1821 "movq 18(%0, %%"REG_b"), %%mm2 \n\t" |
1822 "movq 18(%1, %%"REG_b"), %%mm3 \n\t" |
4619 | 1823 PAVGB(%%mm1, %%mm4) |
1824 PAVGB(%%mm3, %%mm2) | |
1825 "movq %%mm4, %%mm1 \n\t" | |
1826 "movq %%mm2, %%mm3 \n\t" | |
1827 "psrlq $24, %%mm4 \n\t" | |
1828 "psrlq $24, %%mm2 \n\t" | |
1829 PAVGB(%%mm1, %%mm4) | |
1830 PAVGB(%%mm3, %%mm2) | |
1831 "punpcklbw %%mm7, %%mm4 \n\t" | |
1832 "punpcklbw %%mm7, %%mm2 \n\t" | |
1833 #else | |
1834 "movd 12(%0, %%"REG_b"), %%mm4 \n\t" |
1835 "movd 12(%1, %%"REG_b"), %%mm1 \n\t" |
1836 "movd 15(%0, %%"REG_b"), %%mm2 \n\t" |
1837 "movd 15(%1, %%"REG_b"), %%mm3 \n\t" |
4619 | 1838 "punpcklbw %%mm7, %%mm4 \n\t" |
1839 "punpcklbw %%mm7, %%mm1 \n\t" | |
1840 "punpcklbw %%mm7, %%mm2 \n\t" | |
1841 "punpcklbw %%mm7, %%mm3 \n\t" | |
1842 "paddw %%mm1, %%mm4 \n\t" | |
1843 "paddw %%mm3, %%mm2 \n\t" | |
1844 "paddw %%mm2, %%mm4 \n\t" | |
1845 "movd 18(%0, %%"REG_b"), %%mm5 \n\t" |
1846 "movd 18(%1, %%"REG_b"), %%mm1 \n\t" |
1847 "movd 21(%0, %%"REG_b"), %%mm2 \n\t" |
1848 "movd 21(%1, %%"REG_b"), %%mm3 \n\t" |
4619 | 1849 "punpcklbw %%mm7, %%mm5 \n\t" |
1850 "punpcklbw %%mm7, %%mm1 \n\t" | |
1851 "punpcklbw %%mm7, %%mm2 \n\t" | |
1852 "punpcklbw %%mm7, %%mm3 \n\t" | |
1853 "paddw %%mm1, %%mm5 \n\t" | |
1854 "paddw %%mm3, %%mm2 \n\t" | |
1855 "paddw %%mm5, %%mm2 \n\t" | |
4923 | 1856 "movq "MANGLE(w1111)", %%mm5 \n\t" |
4619 | 1857 "psrlw $2, %%mm4 \n\t" |
1858 "psrlw $2, %%mm2 \n\t" | |
1859 #endif | |
4923 | 1860 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
1861 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4619 | 1862 |
1863 "pmaddwd %%mm4, %%mm1 \n\t" | |
1864 "pmaddwd %%mm2, %%mm3 \n\t" | |
1865 "pmaddwd %%mm6, %%mm4 \n\t" | |
1866 "pmaddwd %%mm6, %%mm2 \n\t" | |
1867 #ifndef FAST_BGR2YV12 | |
1868 "psrad $8, %%mm4 \n\t" | |
1869 "psrad $8, %%mm1 \n\t" | |
1870 "psrad $8, %%mm2 \n\t" | |
1871 "psrad $8, %%mm3 \n\t" | |
1872 #endif | |
1873 "packssdw %%mm2, %%mm4 \n\t" | |
1874 "packssdw %%mm3, %%mm1 \n\t" | |
1875 "pmaddwd %%mm5, %%mm4 \n\t" | |
1876 "pmaddwd %%mm5, %%mm1 \n\t" | |
13720 | 1877 "add $24, %%"REG_b" \n\t"
4619 | 1878 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 |
1879 "psraw $7, %%mm4 \n\t" | |
1880 | |
1881 "movq %%mm0, %%mm1 \n\t" | |
1882 "punpckldq %%mm4, %%mm0 \n\t" | |
1883 "punpckhdq %%mm4, %%mm1 \n\t" | |
1884 "packsswb %%mm1, %%mm0 \n\t" | |
4923 | 1885 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" |
4619 | 1886 |
13720 | 1887 "movd %%mm0, (%2, %%"REG_a") \n\t"
4619 | 1888 "punpckhdq %%mm0, %%mm0 \n\t" |
13720 | 1889 "movd %%mm0, (%3, %%"REG_a") \n\t"
1890 "add $4, %%"REG_a" \n\t"
4619 | 1891 " js 1b \n\t" |
16739 | 1892 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
13720 | 1893 : "%"REG_a, "%"REG_b
4619 | 1894 ); |
4467 | 1895 #else |
1896 int i; | |
1897 for(i=0; i<width; i++) | |
1898 { | |
1899 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3]; | |
1900 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4]; | |
1901 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5]; | |
1902 | |
1903 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1904 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1905 } | |
1906 #endif | |
1907 } | |
1908 | |
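Editor's note on the plain-C fall-back just above: each chroma output averages a 2x2 block (two neighbouring pixels on each of two lines), and the extra factor of four from that summation is folded into the final shift, which is why it is RGB2YUV_SHIFT+2 rather than RGB2YUV_SHIFT. A minimal standalone sketch of the same arithmetic follows; the coefficients and the shift are left as parameters because the real tables live elsewhere in swscale and are not reproduced here.

/* Illustrative sketch only, not part of the original file. row0/row1 point at
   two vertically adjacent BGR24 pixel pairs. */
static unsigned char block_to_u(const unsigned char *row0, const unsigned char *row1,
                                int RU, int GU, int BU, int shift)
{
    int b = row0[0] + row0[3] + row1[0] + row1[3];   /* sum of 4 blue samples  */
    int g = row0[1] + row0[4] + row1[1] + row1[4];   /* sum of 4 green samples */
    int r = row0[2] + row0[5] + row1[2] + row1[5];   /* sum of 4 red samples   */
    return ((RU*r + GU*g + BU*b) >> (shift + 2)) + 128;  /* +2 undoes the 4x sum */
}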
4578 | 1909 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width) |
1910 { | |
1911 int i; | |
1912 for(i=0; i<width; i++) | |
1913 { | |
9433 | 1914 int d= ((uint16_t*)src)[i]; |
4578 | 1915 int b= d&0x1F; |
1916 int g= (d>>5)&0x3F; | |
1917 int r= (d>>11)&0x1F; | |
1918 | |
1919 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16; | |
1920 } | |
1921 } | |
1922 | |
1923 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1924 { | |
1925 int i; | |
1926 for(i=0; i<width; i++) | |
1927 { | |
9433 | 1928 int d0= ((uint32_t*)src1)[i]; |
1929 int d1= ((uint32_t*)src2)[i]; | |
4579 | 1930 |
1931 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F); | |
1932 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F); | |
1933 | |
1934 int dh2= (dh>>11) + (dh<<21); | |
1935 int d= dh2 + dl; | |
1936 | |
1937 int b= d&0x7F; | |
1938 int r= (d>>11)&0x7F; | |
1939 int g= d>>21; | |
4578 | 1940 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128; |
1941 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128; | |
1942 } | |
1943 } | |
1944 | |
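Editor's note: the mask pairs 0x07E0F81F / 0x07C0F83F in bgr16ToUV above keep the red/blue and green fields of two packed RGB565 pixels far enough apart that, in effect, both pixels (and then both lines) can be added in one 32-bit register without the fields carrying into each other; afterwards each field holds a four-pixel sum, which is why 7-bit masks (0x7F) are enough to extract it. Written out naively, the per-channel sums that the bit-twiddling computes look like the sketch below (editor's illustration, not part of the file).

static void rgb565_block_sums(uint16_t p00, uint16_t p01,  /* two pixels, first line  */
                              uint16_t p10, uint16_t p11,  /* two pixels, second line */
                              int *r, int *g, int *b)
{
    uint16_t p[4] = { p00, p01, p10, p11 };
    int i;
    *r = *g = *b = 0;
    for (i = 0; i < 4; i++) {
        *b +=  p[i]        & 0x1F;   /* 5-bit blue  */
        *g += (p[i] >> 5)  & 0x3F;   /* 6-bit green */
        *r += (p[i] >> 11) & 0x1F;   /* 5-bit red   */
    }
}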
4580 | 1945 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width) |
1946 { | |
1947 int i; | |
1948 for(i=0; i<width; i++) | |
1949 { | |
9433 | 1950 int d= ((uint16_t*)src)[i]; |
4580 | 1951 int b= d&0x1F; |
1952 int g= (d>>5)&0x1F; | |
1953 int r= (d>>10)&0x1F; | |
1954 | |
1955 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16; | |
1956 } | |
1957 } | |
1958 | |
1959 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1960 { | |
1961 int i; | |
1962 for(i=0; i<width; i++) | |
1963 { | |
9433 | 1964 int d0= ((uint32_t*)src1)[i]; |
1965 int d1= ((uint32_t*)src2)[i]; | |
4580 | 1966 |
1967 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F); | |
1968 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F); | |
1969 | |
1970 int dh2= (dh>>11) + (dh<<21); | |
1971 int d= dh2 + dl; | |
1972 | |
1973 int b= d&0x7F; | |
1974 int r= (d>>10)&0x7F; | |
1975 int g= d>>21; | |
1976 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128; | |
1977 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128; | |
1978 } | |
1979 } | |
1980 | |
1981 | |
4558 | 1982 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width) |
1983 { | |
1984 int i; | |
1985 for(i=0; i<width; i++) | |
1986 { | |
9433 | 1987 int r= ((uint32_t*)src)[i]&0xFF; |
1988 int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
9499 | 1989 int b= (((uint32_t*)src)[i]>>16)&0xFF; |
4558 | 1990 |
9433 | 1991 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); |
4558 | 1992 } |
1993 } | |
1994 | |
1995 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1996 { | |
1997 int i; | |
1998 for(i=0; i<width; i++) | |
1999 { | |
9433 | 2000 const int a= ((uint32_t*)src1)[2*i+0]; |
2001 const int e= ((uint32_t*)src1)[2*i+1]; | |
2002 const int c= ((uint32_t*)src2)[2*i+0]; | |
2003 const int d= ((uint32_t*)src2)[2*i+1]; | |
2004 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF); | |
2005 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00); | |
2006 const int r= l&0x3FF; | |
2007 const int g= h>>8; | |
2008 const int b= l>>16; | |
4558 | 2009 |
2010 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2011 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2012 } | |
2013 } | |
2014 | |
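Editor's note: the 0xFF00FF / 0x00FF00 masking in rgb32ToUV above is the packed-sum counterpart of the per-pixel loop in rgb32ToY; the byte fields of four pixels are accumulated in one 32-bit word, and since 4*255 = 1020 needs only 10 bits no field overflows into its neighbour, which is why r is recovered with &0x3FF. The naive per-channel equivalent (sketch only):

static void rgb32_block_sums(const uint32_t px[4], int *r, int *g, int *b)
{
    int i;
    *r = *g = *b = 0;
    for (i = 0; i < 4; i++) {
        *r +=  px[i]        & 0xFF;   /* same channel order as rgb32ToY above */
        *g += (px[i] >>  8) & 0xFF;
        *b += (px[i] >> 16) & 0xFF;
    }
}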
2015 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width) | |
2016 { | |
2017 int i; | |
2018 for(i=0; i<width; i++) | |
2019 { | |
2020 int r= src[i*3+0]; | |
2021 int g= src[i*3+1]; | |
2022 int b= src[i*3+2]; | |
2023 | |
9433 | 2024 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); |
4558 | 2025 } |
2026 } | |
2027 | |
2028 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2029 { | |
2030 int i; | |
2031 for(i=0; i<width; i++) | |
2032 { | |
2033 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3]; | |
2034 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4]; | |
2035 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5]; | |
2036 | |
2037 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2038 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2039 } | |
2040 } | |
2041 | |
4467 | 2042 |
3272 | 2043 // Bilinear / Bicubic scaling |
2044 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, | |
16739 | 2045 int16_t *filter, int16_t *filterPos, long filterSize)
3272 | 2046 { |
2047 #ifdef HAVE_MMX | |
9921 | 2048 assert(filterSize % 4 == 0 && filterSize>0);
3272 | 2049 if(filterSize==4) // always true for upscaling, sometimes for downscaling too
2050 { | |
13720 | 2051 long counter= -2*dstW;
3272 | 2052 filter-= counter*2; |
2053 filterPos-= counter/2; | |
2054 dst-= counter/2; | |
2055 asm volatile( | |
2056 "pxor %%mm7, %%mm7 \n\t" | |
4248 | 2057 "movq "MANGLE(w02)", %%mm6 \n\t" |
13720 | 2058 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2059 "mov %%"REG_a", %%"REG_BP" \n\t"
18104 | 2060 ASMALIGN16
3272 | 2061 "1: \n\t" |
13733 | 2062 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2063 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
13720 | 2064 "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2065 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2066 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2067 "movd (%3, %%"REG_b"), %%mm2 \n\t"
3272 | 2068 "punpcklbw %%mm7, %%mm0 \n\t" |
2069 "punpcklbw %%mm7, %%mm2 \n\t" | |
2070 "pmaddwd %%mm1, %%mm0 \n\t" | |
2071 "pmaddwd %%mm2, %%mm3 \n\t" | |
2072 "psrad $8, %%mm0 \n\t" | |
2073 "psrad $8, %%mm3 \n\t" | |
2074 "packssdw %%mm3, %%mm0 \n\t" | |
2075 "pmaddwd %%mm6, %%mm0 \n\t" | |
2076 "packssdw %%mm0, %%mm0 \n\t" | |
13720 | 2077 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2078 "add $4, %%"REG_BP" \n\t"
3272 | 2079 " jnc 1b \n\t" |
3352 | 2080 |
13720 | 2081 "pop %%"REG_BP" \n\t"
3272 | 2082 : "+a" (counter) |
2083 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
13720 | 2084 : "%"REG_b
3272 | 2085 ); |
2086 } | |
2087 else if(filterSize==8) | |
2088 { | |
13720 | 2089 long counter= -2*dstW;
3272 | 2090 filter-= counter*4; |
2091 filterPos-= counter/2; | |
2092 dst-= counter/2; | |
2093 asm volatile( | |
2094 "pxor %%mm7, %%mm7 \n\t" | |
4248 | 2095 "movq "MANGLE(w02)", %%mm6 \n\t" |
13720 | 2096 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2097 "mov %%"REG_a", %%"REG_BP" \n\t"
18104 | 2098 ASMALIGN16
3272 | 2099 "1: \n\t" |
13733 | 2100 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2101 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
13720 | 2102 "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2103 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2104 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2105 "movd (%3, %%"REG_b"), %%mm2 \n\t"
3272 | 2106 "punpcklbw %%mm7, %%mm0 \n\t" |
2107 "punpcklbw %%mm7, %%mm2 \n\t" | |
2108 "pmaddwd %%mm1, %%mm0 \n\t" | |
2109 "pmaddwd %%mm2, %%mm3 \n\t" | |
2316 | 2110
13720 | 2111 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2112 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2113 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2114 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
3272 | 2115 "punpcklbw %%mm7, %%mm4 \n\t" |
2116 "punpcklbw %%mm7, %%mm2 \n\t" | |
2117 "pmaddwd %%mm1, %%mm4 \n\t" | |
2118 "pmaddwd %%mm2, %%mm5 \n\t" | |
2119 "paddd %%mm4, %%mm0 \n\t" | |
2120 "paddd %%mm5, %%mm3 \n\t" | |
2121 | |
2122 "psrad $8, %%mm0 \n\t" | |
2123 "psrad $8, %%mm3 \n\t" | |
2124 "packssdw %%mm3, %%mm0 \n\t" | |
2125 "pmaddwd %%mm6, %%mm0 \n\t" | |
2126 "packssdw %%mm0, %%mm0 \n\t" | |
13720 | 2127 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2128 "add $4, %%"REG_BP" \n\t"
3272 | 2129 " jnc 1b \n\t" |
3344 | 2130 |
13720 | 2131 "pop %%"REG_BP" \n\t"
3272 | 2132 : "+a" (counter) |
2133 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
13720 | 2134 : "%"REG_b
3272 | 2135 ); |
2136 } | |
2137 else | |
2138 { | |
15617 | 2139 uint8_t *offset = src+filterSize;
13720 | 2140 long counter= -2*dstW;
3272 | 2141 // filter-= counter*filterSize/2; |
2142 filterPos-= counter/2; | |
2143 dst-= counter/2; | |
2144 asm volatile( | |
2145 "pxor %%mm7, %%mm7 \n\t" | |
4248 | 2146 "movq "MANGLE(w02)", %%mm6 \n\t" |
18104 | 2147 ASMALIGN16
3272 | 2148 "1: \n\t" |
13720 | 2149 "mov %2, %%"REG_c" \n\t"
13733 | 2150 "movzwl (%%"REG_c", %0), %%eax \n\t"
2151 "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
13720 | 2152 "mov %5, %%"REG_c" \n\t"
3272 | 2153 "pxor %%mm4, %%mm4 \n\t" |
2154 "pxor %%mm5, %%mm5 \n\t" | |
2155 "2: \n\t" | |
2156 "movq (%1), %%mm1 \n\t" | |
2157 "movq (%1, %6), %%mm3 \n\t" | |
13720 | 2158 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2159 "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
3272 | 2160 "punpcklbw %%mm7, %%mm0 \n\t" |
2161 "punpcklbw %%mm7, %%mm2 \n\t" | |
2162 "pmaddwd %%mm1, %%mm0 \n\t" | |
2163 "pmaddwd %%mm2, %%mm3 \n\t" | |
2164 "paddd %%mm3, %%mm5 \n\t" | |
2165 "paddd %%mm0, %%mm4 \n\t" | |
13720 | 2166 "add $8, %1 \n\t"
2167 "add $4, %%"REG_c" \n\t"
2168 "cmp %4, %%"REG_c" \n\t"
3272 | 2169 " jb 2b \n\t" |
13720 | 2170 "add %6, %1 \n\t"
3272 | 2171 "psrad $8, %%mm4 \n\t" |
2172 "psrad $8, %%mm5 \n\t" | |
2173 "packssdw %%mm5, %%mm4 \n\t" | |
2174 "pmaddwd %%mm6, %%mm4 \n\t" | |
2175 "packssdw %%mm4, %%mm4 \n\t" | |
13720 | 2176 "mov %3, %%"REG_a" \n\t"
2177 "movd %%mm4, (%%"REG_a", %0) \n\t"
2178 "add $4, %0 \n\t"
3272 | 2179 " jnc 1b \n\t" |
3344 | 2180 |
3641 | 2181 : "+r" (counter), "+r" (filter) |
15617 | 2182 : "m" (filterPos), "m" (dst), "m"(offset),
16739 | 2183 "m" (src), "r" (filterSize*2)
13720 | 2184 : "%"REG_b, "%"REG_a, "%"REG_c
3272 | 2185 ); |
2186 } | |
2187 #else | |
12130 | 2188 #ifdef HAVE_ALTIVEC
2189 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2190 #else
3272 | 2191 int i; |
2192 for(i=0; i<dstW; i++) | |
2193 { | |
2194 int j; | |
2195 int srcPos= filterPos[i]; | |
2196 int val=0; | |
3344 | 2197 // printf("filterPos: %d\n", filterPos[i]); |
3272 | 2198 for(j=0; j<filterSize; j++) |
2199 { | |
2200 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); | |
2201 val += ((int)src[srcPos + j])*filter[filterSize*i + j]; | |
2202 } | |
2203 // filter += hFilterSize; | |
2204 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ... | |
2205 // dst[i] = val>>7; | |
2206 } | |
2207 #endif | |
12130 | 2208 #endif
3272 | 2209 } |
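Editor's summary of what hScale() computes, common to the MMX paths and the C reference loop above: each output sample is a small FIR filter over the source line, dst[i] = clip((sum over j of src[filterPos[i]+j] * filter[i*filterSize+j]) >> 7, 0, (1<<15)-1), kept as a 15-bit intermediate for the later vertical pass. A compact single-sample sketch (illustrative only, not a replacement for the code above):

static int16_t hscale_one(const uint8_t *src, const int16_t *taps,
                          int srcPos, int filterSize)
{
    int j, val = 0;
    for (j = 0; j < filterSize; j++)
        val += (int)src[srcPos + j] * taps[j];
    val >>= 7;                                   /* back to the 15-bit range      */
    if (val < 0)            val = 0;             /* sharp filters can overshoot   */
    if (val > (1<<15) - 1)  val = (1<<15) - 1;
    return (int16_t)val;
}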
2210 // *** horizontal scale Y line to temp buffer | |
4276 | 2211 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2212 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
4467 | 2213 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, |
5452 | 2214 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, |
2215 int32_t *mmx2FilterPos) | |
2469 | 2216 { |
4467 | 2217 if(srcFormat==IMGFMT_YUY2) |
2218 { | |
2219 RENAME(yuy2ToY)(formatConvBuffer, src, srcW); | |
2220 src= formatConvBuffer; | |
2221 } | |
9071 | 2222 else if(srcFormat==IMGFMT_UYVY) |
2223 { | |
2224 RENAME(uyvyToY)(formatConvBuffer, src, srcW); | |
2225 src= formatConvBuffer; | |
2226 } | |
4467 | 2227 else if(srcFormat==IMGFMT_BGR32) |
2228 { | |
2229 RENAME(bgr32ToY)(formatConvBuffer, src, srcW); | |
2230 src= formatConvBuffer; | |
2231 } | |
2232 else if(srcFormat==IMGFMT_BGR24) | |
2233 { | |
2234 RENAME(bgr24ToY)(formatConvBuffer, src, srcW); | |
2235 src= formatConvBuffer; | |
2236 } | |
4578 | 2237 else if(srcFormat==IMGFMT_BGR16) |
2238 { | |
2239 RENAME(bgr16ToY)(formatConvBuffer, src, srcW); | |
2240 src= formatConvBuffer; | |
2241 } | |
4580 | 2242 else if(srcFormat==IMGFMT_BGR15) |
2243 { | |
2244 RENAME(bgr15ToY)(formatConvBuffer, src, srcW); | |
2245 src= formatConvBuffer; | |
2246 } | |
4558 | 2247 else if(srcFormat==IMGFMT_RGB32) |
2248 { | |
2249 RENAME(rgb32ToY)(formatConvBuffer, src, srcW); | |
2250 src= formatConvBuffer; | |
2251 } | |
2252 else if(srcFormat==IMGFMT_RGB24) | |
2253 { | |
2254 RENAME(rgb24ToY)(formatConvBuffer, src, srcW); | |
2255 src= formatConvBuffer; | |
2256 } | |
4467 | 2257 |
3352 | 2258 #ifdef HAVE_MMX |
11000 | 2259 // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
4276 | 2260 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
3352 | 2261 #else |
4276 | 2262 if(!(flags&SWS_FAST_BILINEAR))
3352 | 2263 #endif |
3272 | 2264 { |
2265 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); | |
2266 } | |
2267 else // Fast Bilinear upscale / crap downscale | |
2268 { | |
13720 | 2269 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2469 | 2270 #ifdef HAVE_MMX2 |
2671 | 2271 int i; |
2469 | 2272 if(canMMX2BeUsed) |
2273 { | |
2274 asm volatile( | |
2275 "pxor %%mm7, %%mm7 \n\t" | |
13720 | 2276 "mov %0, %%"REG_c" \n\t"
2277 "mov %1, %%"REG_D" \n\t"
2278 "mov %2, %%"REG_d" \n\t"
2279 "mov %3, %%"REG_b" \n\t"
2280 "xor %%"REG_a", %%"REG_a" \n\t" // i
2281 PREFETCH" (%%"REG_c") \n\t"
2282 PREFETCH" 32(%%"REG_c") \n\t"
2283 PREFETCH" 64(%%"REG_c") \n\t"
2520 | 2284 |
14556 | 2285 #ifdef ARCH_X86_64 |
2286 | |
2469 | 2287 #define FUNNY_Y_CODE \ |
14556 | 2288 "movl (%%"REG_b"), %%esi \n\t"\ |
5452 | 2289 "call *%4 \n\t"\ |
14556 | 2290 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\ |
2291 "add %%"REG_S", %%"REG_c" \n\t"\ | |
14536 | 2292 "add %%"REG_a", %%"REG_D" \n\t"\
13720 | 2293 "xor %%"REG_a", %%"REG_a" \n\t"\
2520 | 2294 |
14556 | 2295 #else |
2296 | |
2297 #define FUNNY_Y_CODE \ | |
2298 "movl (%%"REG_b"), %%esi \n\t"\ | |
2299 "call *%4 \n\t"\ | |
2300 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\ | |
2301 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2302 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2303 | |
2304 #endif | |
2305 | |
2469 | 2306 FUNNY_Y_CODE |
2307 FUNNY_Y_CODE | |
2308 FUNNY_Y_CODE | |
2309 FUNNY_Y_CODE | |
2310 FUNNY_Y_CODE | |
2311 FUNNY_Y_CODE | |
2312 FUNNY_Y_CODE | |
2313 FUNNY_Y_CODE | |
2314 | |
5452 | 2315 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2316 "m" (funnyYCode) | |
14536 | 2317 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2469 | 2318 ); |
3215 | 2319 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; |
2469 | 2320 } |
2321 else | |
2322 { | |
2323 #endif | |
15617 | 2324 int xInc_shr16 = xInc >> 16;
2325 int xInc_mask = xInc & 0xffff;
2469 | 2326 //NO MMX just normal asm ... |
2327 asm volatile( | |
13720 | 2328 "xor %%"REG_a", %%"REG_a" \n\t" // i
2329 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2469 | 2330 "xorl %%ecx, %%ecx \n\t" // 2*xalpha |
18104 | 2331 ASMALIGN16
2469 | 2332 "1: \n\t" |
13720 | 2333 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2334 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2469 | 2335 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2336 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2337 "shll $16, %%edi \n\t" | |
2338 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
13720 | 2339 "mov %1, %%"REG_D" \n\t"
2469 | 2340 "shrl $9, %%esi \n\t" |
13720 | 2341 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2469 | 2342 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF |
13720 | 2343 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2469 | 2344 |
13720 | 2345 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2346 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2469 | 2347 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2348 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2349 "shll $16, %%edi \n\t" | |
2350 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
13720 | 2351 "mov %1, %%"REG_D" \n\t"
2469 | 2352 "shrl $9, %%esi \n\t" |
13720 | 2353 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2469 | 2354 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF |
13720 | 2355 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2469 | 2356 |
2357 | |
13720 | 2358 "add $2, %%"REG_a" \n\t"
2359 "cmp %2, %%"REG_a" \n\t"
2469 | 2360 " jb 1b \n\t" |
2361 | |
2362 | |
15617 | 2363 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
13720 | 2364 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2469 | 2365 ); |
2366 #ifdef HAVE_MMX2 | |
11000 | 2367 } //if MMX2 can't be used |
2469 | 2368 #endif |
2369 #else | |
2671 | 2370 int i; |
2371 unsigned int xpos=0; | |
2372 for(i=0;i<dstWidth;i++) | |
2373 { | |
2374 register unsigned int xx=xpos>>16; | |
2375 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2376 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; | |
2377 xpos+=xInc; | |
2378 } | |
2469 | 2379 #endif |
3272 | 2380 } |
2469 | 2381 } |
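Editor's note on the fast-bilinear branch of hyscale() above: the source position advances in 16.16 fixed point, the fractional part is reduced to 7 bits, and the interpolated sample is kept in the same 15-bit range that hScale() produces (hence the <<7). A made-up worked example of the stepping, assuming the increment is derived roughly as (srcW<<16)/dstWidth in the context setup (which is not shown here):

unsigned srcW = 4, dstWidth = 8;
unsigned xInc = (srcW << 16) / dstWidth;   /* 0x8000: half a source pixel per output */
unsigned xpos = 3 * xInc;                  /* position of the fourth output sample   */
unsigned xx     = xpos >> 16;              /* = 1  -> interpolate src[1]..src[2]     */
unsigned xalpha = (xpos & 0xFFFF) >> 9;    /* = 64 -> weight 64/128 = 0.5            */
/* dst = (src[xx]<<7) + (src[xx+1]-src[xx])*xalpha, exactly as in the loop above */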
2382 | |
16739 | 2383 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
4276 | 2384 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
4467 | 2385 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, |
5452 | 2386 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, |
2387 int32_t *mmx2FilterPos) | |
2469 | 2388 { |
4467 | 2389 if(srcFormat==IMGFMT_YUY2) |
2390 { | |
2391 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2392 src1= formatConvBuffer; | |
2393 src2= formatConvBuffer+2048; | |
2394 } | |
9071 | 2395 else if(srcFormat==IMGFMT_UYVY) |
2396 { | |
2397 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2398 src1= formatConvBuffer; | |
2399 src2= formatConvBuffer+2048; | |
2400 } | |
4467 | 2401 else if(srcFormat==IMGFMT_BGR32) |
2402 { | |
2403 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2404 src1= formatConvBuffer; | |
2405 src2= formatConvBuffer+2048; | |
2406 } | |
2407 else if(srcFormat==IMGFMT_BGR24) | |
2408 { | |
2409 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2410 src1= formatConvBuffer; | |
2411 src2= formatConvBuffer+2048; | |
2412 } | |
4578 | 2413 else if(srcFormat==IMGFMT_BGR16) |
2414 { | |
2415 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2416 src1= formatConvBuffer; | |
2417 src2= formatConvBuffer+2048; | |
2418 } | |
4580 | 2419 else if(srcFormat==IMGFMT_BGR15) |
2420 { | |
2421 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2422 src1= formatConvBuffer; | |
2423 src2= formatConvBuffer+2048; | |
2424 } | |
4558 | 2425 else if(srcFormat==IMGFMT_RGB32) |
2426 { | |
2427 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2428 src1= formatConvBuffer; | |
2429 src2= formatConvBuffer+2048; | |
2430 } | |
2431 else if(srcFormat==IMGFMT_RGB24) | |
2432 { | |
2433 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2434 src1= formatConvBuffer; | |
2435 src2= formatConvBuffer+2048; | |
2436 } | |
4481 | 2437 else if(isGray(srcFormat)) |
2438 { | |
2439 return; | |
2440 } | |
4467 | 2441 |
3352 | 2442 #ifdef HAVE_MMX |
11000 | 2443 // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
4276 | 2444 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
3352 | 2445 #else |
4276 | 2446 if(!(flags&SWS_FAST_BILINEAR))
3352 | 2447 #endif |
3272 | 2448 { |
2449 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2450 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2451 } | |
2452 else // Fast Bilinear upscale / crap downscale | |
2453 { | |
13720 | 2454 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2469 | 2455 #ifdef HAVE_MMX2 |
2671 | 2456 int i; |
2469 | 2457 if(canMMX2BeUsed) |
2458 { | |
2459 asm volatile( | |
5452 | 2460 "pxor %%mm7, %%mm7 \n\t" |
13720 | 2461 "mov %0, %%"REG_c" \n\t"
2462 "mov %1, %%"REG_D" \n\t"
2463 "mov %2, %%"REG_d" \n\t"
2464 "mov %3, %%"REG_b" \n\t"
2465 "xor %%"REG_a", %%"REG_a" \n\t" // i
2466 PREFETCH" (%%"REG_c") \n\t"
2467 PREFETCH" 32(%%"REG_c") \n\t"
2468 PREFETCH" 64(%%"REG_c") \n\t"
5452 | 2469 |
14556 | 2470 #ifdef ARCH_X86_64 |
2471 | |
5452 | 2472 #define FUNNY_UV_CODE \ |
13720 | 2473 "movl (%%"REG_b"), %%esi \n\t"\
5452 | 2474 "call *%4 \n\t"\ |
14556 | 2475 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\ |
2476 "add %%"REG_S", %%"REG_c" \n\t"\ | |
13720 | 2477 "add %%"REG_a", %%"REG_D" \n\t"\
2478 "xor %%"REG_a", %%"REG_a" \n\t"\
2469 | 2479 |
14556 | 2480 #else |
2481 | |
2482 #define FUNNY_UV_CODE \ | |
2483 "movl (%%"REG_b"), %%esi \n\t"\ | |
2484 "call *%4 \n\t"\ | |
2485 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\ | |
2486 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2487 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2488 | |
2489 #endif | |
2490 | |
5452 | 2491 FUNNY_UV_CODE |
2492 FUNNY_UV_CODE | |
2493 FUNNY_UV_CODE | |
2494 FUNNY_UV_CODE | |
13720 | 2495 "xor %%"REG_a", %%"REG_a" \n\t" // i
2496 "mov %5, %%"REG_c" \n\t" // src
2497 "mov %1, %%"REG_D" \n\t" // buf1
2498 "add $4096, %%"REG_D" \n\t"
2499 PREFETCH" (%%"REG_c") \n\t"
2500 PREFETCH" 32(%%"REG_c") \n\t"
2501 PREFETCH" 64(%%"REG_c") \n\t"
2469 | 2502 |
5452 | 2503 FUNNY_UV_CODE |
2504 FUNNY_UV_CODE | |
2505 FUNNY_UV_CODE | |
2506 FUNNY_UV_CODE | |
2469 | 2507 |
5452 | 2508 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2509 "m" (funnyUVCode), "m" (src2) | |
14556 | 2510 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D |
5452 | 2511 ); |
3344 | 2512 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) |
2469 | 2513 { |
3344 | 2514 // printf("%d %d %d\n", dstWidth, i, srcW); |
2515 dst[i] = src1[srcW-1]*128; | |
2516 dst[i+2048] = src2[srcW-1]*128; | |
2469 | 2517 } |
2518 } | |
2519 else | |
2520 { | |
2521 #endif | |
15617 | 2522 long xInc_shr16 = (long) (xInc >> 16);
2523 int xInc_mask = xInc & 0xffff;
2469 | 2524 asm volatile( |
13720 | 2525 "xor %%"REG_a", %%"REG_a" \n\t" // i
2526 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2469 | 2527 "xorl %%ecx, %%ecx \n\t" // 2*xalpha |
18104 | 2528 ASMALIGN16
2469 | 2529 "1: \n\t" |
13720 | 2530 "mov %0, %%"REG_S" \n\t"
2531 "movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
2532 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
2469 | 2533 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2534 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2535 "shll $16, %%edi \n\t" | |
2536 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
13720 | 2537 "mov %1, %%"REG_D" \n\t"
2469 | 2538 "shrl $9, %%esi \n\t" |
15845 | 2539 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t" |
2469 | 2540 |
13720 | 2541 "movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2542 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2469 | 2543 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2544 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2545 "shll $16, %%edi \n\t" | |
2546 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
13720 | 2547 "mov %1, %%"REG_D" \n\t"
2469 | 2548 "shrl $9, %%esi \n\t" |
13720 | 2549 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2469 | 2550 |
2551 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
13720 | 2552 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2553 "add $1, %%"REG_a" \n\t"
2554 "cmp %2, %%"REG_a" \n\t"
2469 | 2555 " jb 1b \n\t" |
2556 | |
15972 | 2557 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2558 which is needed to support GCC-4.0 */
2559 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2560 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2561 #else
15858 | 2562 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
15972 | 2563 #endif
2469 | 2564 "r" (src2) |
13720 | 2565 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2469 | 2566 ); |
2567 #ifdef HAVE_MMX2 | |
11000 | 2568 } //if MMX2 can't be used |
2469 | 2569 #endif |
2570 #else | |
2671 | 2571 int i; |
2572 unsigned int xpos=0; | |
2573 for(i=0;i<dstWidth;i++) | |
2574 { | |
2575 register unsigned int xx=xpos>>16; | |
2576 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2577 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | |
2578 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | |
2566 | 2579 /* slower |
2580 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; | |
2581 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; | |
2582 */ | |
2671 | 2583 xpos+=xInc; |
2584 } | |
2469 | 2585 #endif |
3272 | 2586 } |
2587 } | |
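Editor's note on hcscale() as a whole: the two chroma inputs are scaled into one intermediate buffer, the first plane at dst[0..dstWidth-1] and the second at dst+2048, which is why the inline asm above writes the second result to 4096(%%REG_D, ...) and adds $4096 to the destination pointer: 2048 samples of 2 bytes each. A two-line illustration of that layout:

uint16_t *chr1 = dst;          /* horizontally scaled first chroma plane  */
uint16_t *chr2 = dst + 2048;   /* second plane, fixed 2048-sample offset  */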
2588 | |
9499 | 2589 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, |
2590 int srcSliceH, uint8_t* dst[], int dstStride[]){ | |
3344 | 2591 |
4276 | 2592 /* load a few things into local vars to make the code more readable? and faster */
2593 const int srcW= c->srcW;
2594 const int dstW= c->dstW;
2595 const int dstH= c->dstH;
2596 const int chrDstW= c->chrDstW;
6532 | 2597 const int chrSrcW= c->chrSrcW;
4276 | 2598 const int lumXInc= c->lumXInc;
2599 const int chrXInc= c->chrXInc;
4295 | 2600 const int dstFormat= c->dstFormat; |
6503 | 2601 const int srcFormat= c->srcFormat; |
4276 | 2602 const int flags= c->flags;
2603 const int canMMX2BeUsed= c->canMMX2BeUsed;
2604 int16_t *vLumFilterPos= c->vLumFilterPos;
2605 int16_t *vChrFilterPos= c->vChrFilterPos;
2606 int16_t *hLumFilterPos= c->hLumFilterPos;
2607 int16_t *hChrFilterPos= c->hChrFilterPos;
2608 int16_t *vLumFilter= c->vLumFilter;
2609 int16_t *vChrFilter= c->vChrFilter;
2610 int16_t *hLumFilter= c->hLumFilter;
2611 int16_t *hChrFilter= c->hChrFilter;
9413 | 2612 int32_t *lumMmxFilter= c->lumMmxFilter; |
2613 int32_t *chrMmxFilter= c->chrMmxFilter; | |
4276 | 2614 const int vLumFilterSize= c->vLumFilterSize;
2615 const int vChrFilterSize= c->vChrFilterSize;
2616 const int hLumFilterSize= c->hLumFilterSize;
2617 const int hChrFilterSize= c->hChrFilterSize;
2618 int16_t **lumPixBuf= c->lumPixBuf;
2619 int16_t **chrPixBuf= c->chrPixBuf;
2620 const int vLumBufSize= c->vLumBufSize;
2621 const int vChrBufSize= c->vChrBufSize;
2622 uint8_t *funnyYCode= c->funnyYCode;
2623 uint8_t *funnyUVCode= c->funnyUVCode;
4467 | 2624 uint8_t *formatConvBuffer= c->formatConvBuffer; |
6532 | 2625 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2626 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
9494 | 2627 int lastDstY;
3344 | 2628 |
4276 | 2629 /* vars which will change and which we need to store back in the context */
2630 int dstY= c->dstY;
2631 int lumBufIndex= c->lumBufIndex;
2632 int chrBufIndex= c->chrBufIndex;
2633 int lastInLumBuf= c->lastInLumBuf;
2634 int lastInChrBuf= c->lastInChrBuf;
6540 | 2635 |
2636 if(isPacked(c->srcFormat)){ | |
4467 | 2637 src[0]= |
2638 src[1]= | |
9499 | 2639 src[2]= src[0]; |
6540 | 2640 srcStride[0]= |
4467 | 2641 srcStride[1]= |
9499 | 2642 srcStride[2]= srcStride[0]; |
4467 | 2643 } |
6540 | 2644 srcStride[1]<<= c->vChrDrop; |
2645 srcStride[2]<<= c->vChrDrop; | |
4419 | 2646 |
6517 | 2647 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2], |
2648 // (int)dst[0], (int)dst[1], (int)dst[2]); | |
2649 | |
2650 #if 0 //self test FIXME move to a vfilter or something | |
2651 { | |
2652 static volatile int i=0; | |
2653 i++; | |
2654 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH) | |
2655 selfTest(src, srcStride, c->srcW, c->srcH); | |
2656 i--; | |
2657 } | |
2658 #endif | |
4554 | 2659 |
2660 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2], | |
2661 //dstStride[0],dstStride[1],dstStride[2]); | |
4419 | 2662 |
2663 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0) | |
2664 { | |
2665 static int firstTime=1; //FIXME move this into the context perhaps | |
2666 if(flags & SWS_PRINT_INFO && firstTime) | |
2667 { | |
9970 | 2668 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n" |
4419 | 2669 "SwScaler: ->cannot do aligned memory accesses anymore\n");
2670 firstTime=0; | |
2671 } | |
2672 } | |
3344 | 2673 |
4467 | 2674 /* Note: the user might start scaling the picture in the middle, so this will not get executed;
2675 this is not really intended but works currently, so people might do it */
4276 | 2676 if(srcSliceY ==0){
2677 lumBufIndex=0;
2678 chrBufIndex=0;
4467 | 2679 dstY=0; |
4276 | 2680 lastInLumBuf= -1;
2681 lastInChrBuf= -1;
3272 | 2682 } |
3344 | 2683 |
9494 | 2684 lastDstY= dstY;
2685
3344 | 2686 for(;dstY < dstH; dstY++){ |
4276 | 2687 unsigned char *dest =dst[0]+dstStride[0]*dstY;
6520 | 2688 const int chrDstY= dstY>>c->chrDstVSubSample; |
2689 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY; | |
2690 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY; | |
3344 | 2691 |
2692 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input | |
2693 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input | |
2694 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input | |
2695 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input | |
2696 | |
11122 | 2697 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n", |
2698 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample); | |
4290 | 2699 //handle holes (FAST_BILINEAR & weird filters)
2700 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2701 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2702 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3344 | 2703 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1) |
2704 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1) | |
2216 | 2705 |
3344 | 2706 // Do we have enough lines in this slice to output the dstY line |
6532 | 2707 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2469 | 2708 { |
3344 | 2709 //Do horizontal scaling |
2710 while(lastInLumBuf < lastLumSrcY) | |
2711 { | |
4276 | 2712 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3344 | 2713 lumBufIndex++; |
4290 | 2714 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY); |
3344 | 2715 ASSERT(lumBufIndex < 2*vLumBufSize) |
2716 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
2717 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
2718 // printf("%d %d\n", lumBufIndex, vLumBufSize); | |
4276 | 2719 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
2720 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, |
5452 | 2721 funnyYCode, c->srcFormat, formatConvBuffer, |
2722 c->lumMmx2Filter, c->lumMmx2FilterPos); | |
3344 | 2723 lastInLumBuf++; |
2724 } | |
2725 while(lastInChrBuf < lastChrSrcY) | |
2726 { | |
6532 | 2727 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; |
2728 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; |
3344 | 2729 chrBufIndex++; |
2730 ASSERT(chrBufIndex < 2*vChrBufSize) | |
6532 | 2731 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)) |
2732 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0) |
4276 | 2733 //FIXME replace parameters through context struct (some at least) |
6503 | 2734 |
2735 if(!(isGray(srcFormat) || isGray(dstFormat))) | |
6532 | 2736 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, |
4276 | 2737 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, |
5452 | 2738 funnyUVCode, c->srcFormat, formatConvBuffer, |
2739 c->chrMmx2Filter, c->chrMmx2FilterPos); | |
3344 | 2740 lastInChrBuf++; |
2741 } | |
2742 //wrap buf index around to stay inside the ring buffer | |
2743 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
2744 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
2469 | 2745 } |
3344 | 2746 else // not enough lines left in this slice -> load the rest into the buffer |
2469 | 2747 { |
3344 | 2748 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n", |
2749 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY, | |
2750 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize, | |
6532 | 2751 vChrBufSize, vLumBufSize);*/ |
2752 |
3344 | 2753 //Do horizontal scaling |
2754 while(lastInLumBuf+1 < srcSliceY + srcSliceH) | |
2469 | 2755 { |
4276 | 2756 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; |
3344 | 2757 lumBufIndex++; |
2758 ASSERT(lumBufIndex < 2*vLumBufSize) | |
2759 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
2760 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
4276 | 2761 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
2762 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, |
5452 | 2763 funnyYCode, c->srcFormat, formatConvBuffer, |
2764 c->lumMmx2Filter, c->lumMmx2FilterPos); | |
3344 | 2765 lastInLumBuf++; |
2469 | 2766 } |
6532 | 2767 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH)) |
3344 | 2768 { |
6532 | 2769 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; |
2770 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; |
3344 | 2771 chrBufIndex++; |
2772 ASSERT(chrBufIndex < 2*vChrBufSize) | |
6532 | 2773 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH) |
2774 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0) |
6503 | 2775 |
2776 if(!(isGray(srcFormat) || isGray(dstFormat))) | |
6532 | 2777 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, |
4276 | 2778 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, |
5452 | 2779 funnyUVCode, c->srcFormat, formatConvBuffer, |
2780 c->chrMmx2Filter, c->chrMmx2FilterPos); | |
3344 | 2781 lastInChrBuf++; |
2782 } | |
2783 //wrap buf index around to stay inside the ring buffer | |
2784 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
2785 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
11000 | 2786 break; //we can't output a dstY line so let's try with the next slice |
2469 | 2787 } |
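/* Note on the slice handling above: lumPixBuf/chrPixBuf act as ring buffers of
   the last few horizontally scaled lines (vLumBufSize/vChrBufSize entries, with
   the indices wrapped once per pass), so a destination line can be produced as
   soon as all of its input lines have arrived in some slice. The expression
   -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample) rounds the chroma line count
   up rather than down: e.g. for 4:2:0 input (chrSrcVSubSample == 1) and
   srcSliceY + srcSliceH == 17 it gives -((-17)>>1) = 9 = ceil(17/2), assuming
   the usual arithmetic right shift of negative values. */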
2264 | 2788 |
2748 | 2789 #ifdef HAVE_MMX |
3344 | 2790 b5Dither= dither8[dstY&1]; |
2791 g6Dither= dither4[dstY&1]; | |
2792 g5Dither= dither8[dstY&1]; | |
2793 r5Dither= dither8[(dstY+1)&1]; | |
2748 | 2794 #endif |
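/* The four values above pick a row of the small ordered-dither tables based on
   the parity of the output line (dstY&1); red intentionally uses the opposite
   row ((dstY+1)&1) so its dither pattern does not coincide with blue/green.
   These bias values are consumed by the 15 and 16 bpp RGB output paths. */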
4276 | 2795 if(dstY < dstH-2) |
3352 | 2796 { |
9414 | 2797 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; |
2798 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
2799 #ifdef HAVE_MMX | |
2800 int i; | |
2801 for(i=0; i<vLumFilterSize; i++) | |
2802 { | |
2803 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i]; | |
2804 lumMmxFilter[4*i+2]= | |
2805 lumMmxFilter[4*i+3]= | |
2806 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; | |
2807 } | |
2808 for(i=0; i<vChrFilterSize; i++) | |
2809 { | |
2810 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i]; | |
2811 chrMmxFilter[4*i+2]= | |
2812 chrMmxFilter[4*i+3]= | |
2813 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; | |
2814 } | |
2815 #endif | |
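/* Layout of the lumMmxFilter/chrMmxFilter entries filled in above, per vertical
   filter tap i: slot 4*i+0 holds the pointer to the scaled source line (stored
   as a 32 bit integer) and slots 4*i+2 and 4*i+3 hold the 16 bit filter
   coefficient duplicated into both halves of a 32 bit word via the *0x10001
   trick (e.g. 0x0040 becomes 0x00400040), so an 8 byte load from slot 4*i+2
   presumably gives the MMX vertical scaler four identical coefficients at once. */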
14715 | 2816 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){ |
2817 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
2818 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi | |
2819 RENAME(yuv2nv12X)(c, | |
2820 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
2821 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2822 dest, uDest, dstW, chrDstW, dstFormat); | |
2823 } | |
2824 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like | |
3344 | 2825 { |
7351 | 2826 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; |
2827 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
3344 | 2828 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12 |
2829 { | |
2830 int16_t *lumBuf = lumPixBuf[0]; | |
2831 int16_t *chrBuf= chrPixBuf[0]; | |
6532 | 2832 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW); |
3344 | 2833 } |
2834 else //General YV12 | |
2835 { | |
9413 | 2836 RENAME(yuv2yuvX)(c, |
6532 | 2837 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, |
2838 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, |
9414 | 2839 dest, uDest, vDest, dstW, chrDstW); |
3344 | 2840 } |
2841 } | |
2842 else | |
2843 { | |
2844 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
2845 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
2846 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB | |
2847 { | |
2848 int chrAlpha= vChrFilter[2*dstY+1]; | |
7723 | 2849 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), |
6578 | 2850 dest, dstW, chrAlpha, dstFormat, flags, dstY); |
3344 | 2851 } |
2852 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB | |
2853 { | |
2854 int lumAlpha= vLumFilter[2*dstY+1]; | |
2855 int chrAlpha= vChrFilter[2*dstY+1]; | |
7723 | 2856 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), |
6578 | 2857 dest, dstW, lumAlpha, chrAlpha, dstY); |
3344 | 2858 } |
2859 else //General RGB | |
2860 { | |
7723 | 2861 RENAME(yuv2packedX)(c, |
3344 | 2862 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, |
2863 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
9413 | 2864 dest, dstW, dstY); |
3344 | 2865 } |
2866 } | |
3352 | 2867 } |
11000 | 2868 else // hmm looks like we can't use MMX here without overwriting this array's tail |
3352 | 2869 { |
2870 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
2871 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
14715 | 2872 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){ |
2873 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
2874 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi | |
2875 yuv2nv12XinC( | |
2876 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
2877 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2878 dest, uDest, dstW, chrDstW, dstFormat); | |
2879 } | |
2880 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 | |
3352 | 2881 { |
7351 | 2882 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; |
2883 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
6540 | 2884 yuv2yuvXinC( |
6532 | 2885 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, |
2886 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, |
6540 | 2887 dest, uDest, vDest, dstW, chrDstW); |
3352 | 2888 } |
2889 else | |
2890 { | |
2891 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
2892 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
7723 | 2893 yuv2packedXinC(c, |
3352 | 2894 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, |
2895 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
6578 | 2896 dest, dstW, dstY); |
3352 | 2897 } |
2898 } | |
3344 | 2899 } |
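/* Dispatch summary for the vertical pass inside the loop above: NV12/NV21 go
   through yuv2nv12X, planar YUV and gray output through yuv2yuv1 (when no real
   vertical scaling is needed) or yuv2yuvX, and all packed formats through
   yuv2packed1/yuv2packed2/yuv2packedX. uDest/vDest are set to NULL on lines
   where the chroma subsampling says no chroma is due (dstY & chrSkipMask), and
   for the last two output lines the plain C versions (yuv2yuvXinC etc.) are
   used instead of the MMX ones, since the MMX code would overwrite data past
   the array tail, as noted in the comment above. */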
2534 | 2900 |
2901 #ifdef HAVE_MMX |
2902 __asm __volatile(SFENCE:::"memory"); |
2566 | 2903 __asm __volatile(EMMS:::"memory"); |
2534 | 2904 #endif |
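/* The sfence above makes the weakly ordered non-temporal (movntq) stores used
   by the MMX output routines globally visible before the function returns, and
   emms restores the x87 register state after MMX use. */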
4276 | 2905 /* store changed local vars back in the context */ |
2906 c->dstY= dstY; |
2907 c->lumBufIndex= lumBufIndex; |
2908 c->chrBufIndex= chrBufIndex; |
2909 c->lastInLumBuf= lastInLumBuf; |
2910 c->lastInChrBuf= lastInChrBuf; |
9494 | 2911 |
2912 return dstY - lastDstY; |
3641 | 2913 } |
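/* As an aside for readers, the following is a minimal, self-contained sketch
   (not part of the scaler; it would be compiled on its own) of two integer
   idioms used in the main loop above: rounding a line count up after vertical
   chroma subsampling with -((-n)>>shift), and deciding on which output lines
   chroma is written using a (1<<subsample)-1 mask. All names below are
   illustrative only. */
#include <stdio.h>

static int ceil_rshift(int n, int shift)
{
    /* for n >= 0 this equals (n + (1<<shift) - 1) >> shift,
       assuming arithmetic right shift of negative values */
    return -((-n) >> shift);
}

int main(void)
{
    const int chrVSubSample= 1;                  /* 4:2:0 style vertical subsampling */
    const int chrSkipMask= (1<<chrVSubSample)-1;
    int dstY;

    /* 17 luma lines cover ceil(17/2) = 9 chroma lines */
    printf("17 luma lines -> %d chroma lines\n", ceil_rshift(17, chrVSubSample));

    /* chroma is written on even output lines only, exactly as uDest/vDest
       are set to NULL above when (dstY & chrSkipMask) is non-zero */
    for(dstY=0; dstY<4; dstY++)
        printf("dstY=%d: chroma %s\n", dstY, (dstY & chrSkipMask) ? "skipped" : "written");

    return 0;
}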