Mercurial > mplayer.hg

annotate postproc/swscale_template.c @ 17819:7d5716fd634c
init sh_a (new audio stream) to sh_audio (current audio stream); closes cid 241

author:   nicodvb
date:     Sun, 12 Mar 2006 11:12:48 +0000
parents:  fbf94ea858f1
children: 7b408d60de9e

/*
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

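/* Editor's note: the macros above select CPU-specific opcodes at build time:
 * with HAVE_3DNOW, EMMS becomes "femms" and the 3DNow! prefetch/prefetchw
 * instructions are used; with HAVE_MMX2, the SSE prefetchnta/prefetcht0 and
 * sfence are used instead; otherwise the placeholder "/nop" is substituted,
 * apparently intended as a do-nothing filler inside the inline-asm strings. */
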
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

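/* Editor's note: MOVNTQ expands through REAL_MOVNTQ so that macro arguments
 * are fully expanded before being stringized and pasted into the asm text
 * (the usual two-level C preprocessor idiom); the same pattern is used for
 * the REAL_YSCALEYUV2* and REAL_WRITEBGR* macros below. With MMX2 the store
 * becomes a non-temporal movntq, otherwise a plain movq. */
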
#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

#define YSCALEYUV2YV12X(x, offset) \
"xor %%"REG_a", %%"REG_a" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
".balign 16 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
" jnz 1b \n\t"\
"psraw $3, %%mm3 \n\t"\
"psraw $3, %%mm4 \n\t"\
"packuswb %%mm4, %%mm3 \n\t"\
MOVNTQ(%%mm3, (%1, %%REGa))\
"add $8, %%"REG_a" \n\t"\
"cmp %2, %%"REG_a" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"

#define YSCALEYUV2YV121 \
"mov %2, %%"REG_a" \n\t"\
".balign 16 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
"movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
"psraw $7, %%mm0 \n\t"\
"psraw $7, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
MOVNTQ(%%mm0, (%1, %%REGa))\
"add $8, %%"REG_a" \n\t"\
"jnc 1b \n\t"

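/* Editor's note: YSCALEYUV2YV12X is the vertical-filter loop for planar
 * output. For each group of eight output samples it walks a list of
 * (source line pointer, 16-bit coefficient) entries terminated by a zero
 * pointer, accumulates the pmulhw products (signed high word of the 16x16
 * multiply) on top of the rounder constant, shifts right by 3 and packs to
 * unsigned bytes. YSCALEYUV2YV121 is the unfiltered 1:1 case: each 16-bit
 * sample is just shifted down by 7 and packed. Roughly, in scalar form
 * (illustrative sketch only; filterSize, src, filter and clip() are
 * placeholders, not the actual fixed-point table layout used here):
 *
 *     int acc = rounder;
 *     for (j = 0; j < filterSize; j++)
 *         acc += (src[j][i] * filter[j]) >> 16;    // pmulhw + paddw
 *     dest[i] = clip(acc >> 3, 0, 255);            // psraw $3 + packuswb
 */
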
/*
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
"r" (dest), "m" (dstW),
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2PACKEDX \
"xor %%"REG_a", %%"REG_a" \n\t"\
".balign 16 \n\t"\
"nop \n\t"\
"1: \n\t"\
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
".balign 16 \n\t"\
"2: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
"movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
" jnz 2b \n\t"\
\
"lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
"movq %%mm1, %%mm7 \n\t"\
".balign 16 \n\t"\
"2: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm1 \n\t"\
"paddw %%mm5, %%mm7 \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
" jnz 2b \n\t"\


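/* Editor's note: YSCALEYUV2PACKEDX runs the same kind of vertical filter as
 * YSCALEYUV2YV12X, but twice per outer iteration: the first inner loop
 * accumulates the U and V samples (V is read 4096 bytes after U in each
 * chroma line) into mm3/mm4, the second accumulates two groups of four luma
 * samples into mm1/mm7. YSCALEYUV2RGBX, defined next, feeds that result
 * through the per-context colour matrix (U_OFFSET/V_OFFSET and the *_COEFF
 * values addressed relative to %0) and leaves packed B, G and R bytes in
 * mm2, mm4 and mm5 with mm7 cleared, ready for one of the WRITE* macros. */
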
#define YSCALEYUV2RGBX \
YSCALEYUV2PACKEDX\
"psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
"pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
"pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
"psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
"pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#if 0
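/* Editor's note: the FULL_YSCALEYUV2RGB macro below is compiled out by this
 * #if 0 / #endif pair and is apparently kept only for reference; unlike the
 * macros above it uses the global MANGLE()d constants (w80, w400, yCoeff,
 * ubCoeff, ...) rather than per-context offsets. */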
#define FULL_YSCALEYUV2RGB \
"pxor %%mm7, %%mm7 \n\t"\
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
"punpcklwd %%mm6, %%mm6 \n\t"\
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
\
\
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
"psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
\
\
"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
"paddw %%mm1, %%mm3 \n\t" /* B*/\
"paddw %%mm1, %%mm0 \n\t" /* R*/\
"packuswb %%mm3, %%mm3 \n\t"\
\
"packuswb %%mm0, %%mm0 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm2, %%mm1 \n\t" /* G*/\
\
"packuswb %%mm1, %%mm1 \n\t"
#endif

#define REAL_YSCALEYUV2PACKED(index, c) \
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
"psraw $3, %%mm0 \n\t"\
"psraw $3, %%mm1 \n\t"\
"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
"xor "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
"movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

#define REAL_YSCALEYUV2RGB(index, c) \
"xor "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
"movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)

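/* Editor's note: YSCALEYUV2PACKED/YSCALEYUV2RGB above blend two adjacent
 * source lines (buf0/buf1 for luma, uvbuf0/uvbuf1 for chroma) with the
 * per-context alpha weights before conversion. The *1 variants that follow
 * skip the blend and read a single line; the *1b variants instead average
 * the two chroma input buffers (the source comment below calls this
 * vertical chrominance interpolation). */
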
#define REAL_YSCALEYUV2PACKED1(index, c) \
"xor "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $7, %%mm3 \n\t" \
"psraw $7, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
"xor "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
"xor "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $8, %%mm3 \n\t" \
"psrlw $8, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
"xor "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
"psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

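/* Editor's note: the WRITEBGR* macros below take the packed bytes produced
 * above (B in mm2, G in mm4, R in mm5, mm7 = 0), store 8 pixels per
 * iteration with MOVNTQ, then advance index by 8 and loop while it is below
 * dstw. WRITEBGR32 interleaves the three channels with a zero byte into
 * 0RGB dwords (8 pixels, i.e. 32 bytes, per iteration). */
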
#define REAL_WRITEBGR32(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
MOVNTQ(%%mm0, (dst, index, 4))\
MOVNTQ(%%mm2, 8(dst, index, 4))\
MOVNTQ(%%mm1, 16(dst, index, 4))\
MOVNTQ(%%mm3, 24(dst, index, 4))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)

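/* Editor's note: WRITEBGR16 and WRITEBGR15 reduce the 8-bit channels to
 * 16-bit pixels before storing: the bF8/bFC masks (0xF8 / 0xFC per byte)
 * keep the top 5 or 6 bits of each channel, and the shift/unpack/por
 * sequence assembles the 5-6-5 (or 5-5-5 for BGR15) fields. Two quadwords,
 * i.e. 8 pixels, are written per loop iteration. */
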
#define REAL_WRITEBGR16(dst, dstw, index) \
        "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
        "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
        "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
        "psrlq $3, %%mm2            \n\t"\
\
        "movq %%mm2, %%mm1          \n\t"\
        "movq %%mm4, %%mm3          \n\t"\
\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm5, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm4     \n\t"\
        "punpckhbw %%mm5, %%mm1     \n\t"\
\
        "psllq $3, %%mm3            \n\t"\
        "psllq $3, %%mm4            \n\t"\
\
        "por %%mm3, %%mm2           \n\t"\
        "por %%mm4, %%mm1           \n\t"\
\
        MOVNTQ(%%mm2, (dst, index, 2))\
        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
        "add $8, "#index"           \n\t"\
        "cmp "#dstw", "#index"      \n\t"\
        " jb 1b                     \n\t"
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)

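/* For reference, a rough scalar equivalent of what REAL_WRITEBGR16 computes
   per pixel (an illustrative sketch only, not used anywhere in the build):
   blue ends up in bits 0-4, green in bits 5-10, red in bits 11-15 of each
   16 bit output word. */
#if 0
static inline uint16_t pack_rgb565_sketch(uint8_t b, uint8_t g, uint8_t r)
{
    /* mirrors the pand/psrlq/psllq/por sequence of the macro above in plain C */
    return (uint16_t)((b>>3) | ((g>>2)<<5) | ((r>>3)<<11));
}
#endif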
#define REAL_WRITEBGR15(dst, dstw, index) \
        "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
        "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
        "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
        "psrlq $3, %%mm2            \n\t"\
        "psrlq $1, %%mm5            \n\t"\
\
        "movq %%mm2, %%mm1          \n\t"\
        "movq %%mm4, %%mm3          \n\t"\
\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm5, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm4     \n\t"\
        "punpckhbw %%mm5, %%mm1     \n\t"\
\
        "psllq $2, %%mm3            \n\t"\
        "psllq $2, %%mm4            \n\t"\
\
        "por %%mm3, %%mm2           \n\t"\
        "por %%mm4, %%mm1           \n\t"\
\
        MOVNTQ(%%mm2, (dst, index, 2))\
        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
        "add $8, "#index"           \n\t"\
        "cmp "#dstw", "#index"      \n\t"\
        " jb 1b                     \n\t"
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)

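/* WRITEBGR15 follows the same pattern as WRITEBGR16 but keeps only five bits
   per component: blue in bits 0-4, green in bits 5-9, red in bits 10-14, with
   the top bit of each 16 bit word left clear. */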
#define WRITEBGR24OLD(dst, dstw, index) \
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
        "movq %%mm2, %%mm1          \n\t" /* B */\
        "movq %%mm5, %%mm6          \n\t" /* R */\
        "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
        "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
        "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
        "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
        "movq %%mm2, %%mm0          \n\t" /* GBGBGBGB 0 */\
        "movq %%mm1, %%mm3          \n\t" /* GBGBGBGB 2 */\
        "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
        "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
        "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
        "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
        "movq %%mm0, %%mm4          \n\t" /* 0RGB0RGB 0 */\
        "psrlq $8, %%mm0            \n\t" /* 00RGB0RG 0 */\
        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
        "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
        "por %%mm4, %%mm0           \n\t" /* 00RGBRGB 0 */\
        "movq %%mm2, %%mm4          \n\t" /* 0RGB0RGB 1 */\
        "psllq $48, %%mm2           \n\t" /* GB000000 1 */\
        "por %%mm2, %%mm0           \n\t" /* GBRGBRGB 0 */\
\
        "movq %%mm4, %%mm2          \n\t" /* 0RGB0RGB 1 */\
        "psrld $16, %%mm4           \n\t" /* 000R000R 1 */\
        "psrlq $24, %%mm2           \n\t" /* 0000RGB0 1.5 */\
        "por %%mm4, %%mm2           \n\t" /* 000RRGBR 1 */\
        "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
        "movq %%mm1, %%mm4          \n\t" /* 0RGB0RGB 2 */\
        "psrlq $8, %%mm1            \n\t" /* 00RGB0RG 2 */\
        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
        "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
        "por %%mm4, %%mm1           \n\t" /* 00RGBRGB 2 */\
        "movq %%mm1, %%mm4          \n\t" /* 00RGBRGB 2 */\
        "psllq $32, %%mm1           \n\t" /* BRGB0000 2 */\
        "por %%mm1, %%mm2           \n\t" /* BRGBRGBR 1 */\
\
        "psrlq $32, %%mm4           \n\t" /* 000000RG 2.5 */\
        "movq %%mm3, %%mm5          \n\t" /* 0RGB0RGB 3 */\
        "psrlq $8, %%mm3            \n\t" /* 00RGB0RG 3 */\
        "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
        "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
        "por %%mm5, %%mm3           \n\t" /* 00RGBRGB 3 */\
        "psllq $16, %%mm3           \n\t" /* RGBRGB00 3 */\
        "por %%mm4, %%mm3           \n\t" /* RGBRGBRG 2.5 */\
\
        MOVNTQ(%%mm0, (dst))\
        MOVNTQ(%%mm2, 8(dst))\
        MOVNTQ(%%mm3, 16(dst))\
        "add $24, "#dst"            \n\t"\
\
        "add $8, "#index"           \n\t"\
        "cmp "#dstw", "#index"      \n\t"\
        " jb 1b                     \n\t"

#define WRITEBGR24MMX(dst, dstw, index) \
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
        "movq %%mm2, %%mm1          \n\t" /* B */\
        "movq %%mm5, %%mm6          \n\t" /* R */\
        "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
        "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
        "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
        "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
        "movq %%mm2, %%mm0          \n\t" /* GBGBGBGB 0 */\
        "movq %%mm1, %%mm3          \n\t" /* GBGBGBGB 2 */\
        "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
        "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
        "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
        "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
        "movq %%mm0, %%mm4          \n\t" /* 0RGB0RGB 0 */\
        "movq %%mm2, %%mm6          \n\t" /* 0RGB0RGB 1 */\
        "movq %%mm1, %%mm5          \n\t" /* 0RGB0RGB 2 */\
        "movq %%mm3, %%mm7          \n\t" /* 0RGB0RGB 3 */\
\
        "psllq $40, %%mm0           \n\t" /* RGB00000 0 */\
        "psllq $40, %%mm2           \n\t" /* RGB00000 1 */\
        "psllq $40, %%mm1           \n\t" /* RGB00000 2 */\
        "psllq $40, %%mm3           \n\t" /* RGB00000 3 */\
\
        "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
        "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
        "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
        "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
        "psrlq $8, %%mm0            \n\t" /* 00RGBRGB 0 */\
        "movq %%mm2, %%mm6          \n\t" /* 0RGBRGB0 1 */\
        "psllq $40, %%mm2           \n\t" /* GB000000 1 */\
        "por %%mm2, %%mm0           \n\t" /* GBRGBRGB 0 */\
        MOVNTQ(%%mm0, (dst))\
\
        "psrlq $24, %%mm6           \n\t" /* 0000RGBR 1 */\
        "movq %%mm1, %%mm5          \n\t" /* 0RGBRGB0 2 */\
        "psllq $24, %%mm1           \n\t" /* BRGB0000 2 */\
        "por %%mm1, %%mm6           \n\t" /* BRGBRGBR 1 */\
        MOVNTQ(%%mm6, 8(dst))\
\
        "psrlq $40, %%mm5           \n\t" /* 000000RG 2 */\
        "psllq $8, %%mm3            \n\t" /* RGBRGB00 3 */\
        "por %%mm3, %%mm5           \n\t" /* RGBRGBRG 2 */\
        MOVNTQ(%%mm5, 16(dst))\
\
        "add $24, "#dst"            \n\t"\
\
        "add $8, "#index"           \n\t"\
        "cmp "#dstw", "#index"      \n\t"\
        " jb 1b                     \n\t"

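/* WRITEBGR24MMX (like the older WRITEBGR24OLD above) takes the four 0RGB0RGB
   quadwords built from B/G/R bytes and squeezes them into three quadwords of
   tightly packed B,G,R triples: 24 bytes, i.e. 8 pixels, per loop iteration,
   so no padding byte is ever written to the destination. */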
#define WRITEBGR24MMX2(dst, dstw, index) \
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
        "movq "MANGLE(M24A)", %%mm0 \n\t"\
        "movq "MANGLE(M24C)", %%mm7 \n\t"\
        "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
        "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
        "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
        "pand %%mm0, %%mm1          \n\t" /* B2 B1 B0 */\
        "pand %%mm0, %%mm3          \n\t" /* G2 G1 G0 */\
        "pand %%mm7, %%mm6          \n\t" /* R1 R0 */\
\
        "psllq $8, %%mm3            \n\t" /* G2 G1 G0 */\
        "por %%mm1, %%mm6           \n\t"\
        "por %%mm3, %%mm6           \n\t"\
        MOVNTQ(%%mm6, (dst))\
\
        "psrlq $8, %%mm4            \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
        "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
        "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
        "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
        "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
        "pand %%mm7, %%mm3          \n\t" /* G4 G3 */\
        "pand %%mm0, %%mm6          \n\t" /* R4 R3 R2 */\
\
        "por %%mm1, %%mm3           \n\t" /* B5 G4 B4 G3 B3 */\
        "por %%mm3, %%mm6           \n\t"\
        MOVNTQ(%%mm6, 8(dst))\
\
        "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
        "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
        "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
        "pand %%mm7, %%mm1          \n\t" /* B7 B6 */\
        "pand %%mm0, %%mm3          \n\t" /* G7 G6 G5 */\
        "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
        "por %%mm1, %%mm3           \n\t"\
        "por %%mm3, %%mm6           \n\t"\
        MOVNTQ(%%mm6, 16(dst))\
\
        "add $24, "#dst"            \n\t"\
\
        "add $8, "#index"           \n\t"\
        "cmp "#dstw", "#index"      \n\t"\
        " jb 1b                     \n\t"

#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

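/* With MMX2 available, WRITEBGR24 resolves to the pshufw based variant above,
   which needs far fewer shift/mask steps than WRITEBGR24MMX; the choice is
   made at compile time by the HAVE_MMX2 block just before this point. */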
#define REAL_WRITEYUY2(dst, dstw, index) \
        "packuswb %%mm3, %%mm3      \n\t"\
        "packuswb %%mm4, %%mm4      \n\t"\
        "packuswb %%mm7, %%mm1      \n\t"\
        "punpcklbw %%mm4, %%mm3     \n\t"\
        "movq %%mm1, %%mm7          \n\t"\
        "punpcklbw %%mm3, %%mm1     \n\t"\
        "punpckhbw %%mm3, %%mm7     \n\t"\
\
        MOVNTQ(%%mm1, (dst, index, 2))\
        MOVNTQ(%%mm7, 8(dst, index, 2))\
\
        "add $8, "#index"           \n\t"\
        "cmp "#dstw", "#index"      \n\t"\
        " jb 1b                     \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)

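/* WRITEYUY2 interleaves the packed luma bytes (mm1/mm7) with the alternating
   U/V chroma bytes built in mm3, producing the usual Y U Y V byte order of a
   YUY2 scanline, 16 output bytes per loop iteration. */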
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if(uDest != NULL)
    {
        asm volatile(
                YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
                :: "r" (&c->redDither),
                   "r" (uDest), "p" (chrDstW)
                : "%"REG_a, "%"REG_d, "%"REG_S
            );

        asm volatile(
                YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
                :: "r" (&c->redDither),
                   "r" (vDest), "p" (chrDstW)
                : "%"REG_a, "%"REG_d, "%"REG_S
            );
    }

    asm volatile(
            YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
            :: "r" (&c->redDither),
               "r" (dest), "p" (dstW)
            : "%"REG_a, "%"REG_d, "%"REG_S
        );
#else
#ifdef HAVE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}

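/* Conceptually the vertical scaler computes, per output pixel, a fixed point
   dot product over the contributing source lines.  The sketch below shows the
   luma half of what the C fallback does (an illustrative sketch only; the
   19 bit shift and 1<<18 rounding term are assumptions about the fixed point
   layout, not copied from yuv2yuvXinC). */
#if 0
static void yuv2yuvX_luma_sketch(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                 uint8_t *dest, int dstW)
{
    int i, j;
    for(i=0; i<dstW; i++){
        int val= 1<<18;                       /* rounding term */
        for(j=0; j<lumFilterSize; j++)
            val+= lumSrc[j][i] * lumFilter[j];
        val>>= 19;
        if(val<0)        val= 0;              /* clip to 8 bit range */
        else if(val>255) val= 255;
        dest[i]= val;
    }
}
#endif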
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if(uDest != NULL)
    {
        asm volatile(
                YSCALEYUV2YV121
                :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
                   "g" (-chrDstW)
                : "%"REG_a
            );

        asm volatile(
                YSCALEYUV2YV121
                :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
                   "g" (-chrDstW)
                : "%"REG_a
            );
    }

    asm volatile(
            YSCALEYUV2YV121
            :: "r" (lumSrc + dstW), "r" (dest + dstW),
               "g" (-dstW)
            : "%"REG_a
        );
#else
    int i;
    for(i=0; i<dstW; i++)
    {
        int val= lumSrc[i]>>7;

        if(val&256){
            if(val<0) val=0;
            else      val=255;
        }

        dest[i]= val;
    }

    if(uDest != NULL)
        for(i=0; i<chrDstW; i++)
        {
            int u=chrSrc[i]>>7;
            int v=chrSrc[i + 2048]>>7;

            if((u|v)&256){
                if(u<0)         u=0;
                else if (u>255) u=255;
                if(v<0)         v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
#endif
}

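/* In the C path above, the `val&256' test is a cheap range check: bit 8 is
   set for the moderate over- and undershoots that can occur after the >>7,
   so the clamping branch is only taken when clipping is actually needed. */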

/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       uint8_t *dest, int dstW, int dstY)
{
    int dummy=0;
    switch(c->dstFormat)
    {
#ifdef HAVE_MMX
    case IMGFMT_BGR32:
        {
            asm volatile(
                YSCALEYUV2RGBX
                WRITEBGR32(%4, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_d, "%"REG_S
            );
        }
        break;
    case IMGFMT_BGR24:
        {
            asm volatile(
                YSCALEYUV2RGBX
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
                "add %4, %%"REG_b"          \n\t"
                WRITEBGR24(%%REGb, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
            );
        }
        break;
    case IMGFMT_BGR15:
        {
            asm volatile(
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                WRITEBGR15(%4, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_d, "%"REG_S
            );
        }
        break;
    case IMGFMT_BGR16:
        {
            asm volatile(
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                WRITEBGR16(%4, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_d, "%"REG_S
            );
        }
        break;
    case IMGFMT_YUY2:
        {
            asm volatile(
                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3            \n\t"
                "psraw $3, %%mm4            \n\t"
                "psraw $3, %%mm1            \n\t"
                "psraw $3, %%mm7            \n\t"
                WRITEYUY2(%4, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_d, "%"REG_S
            );
        }
        break;
#endif
    default:
#ifdef HAVE_ALTIVEC
        /* The following list of supported dstFormat values should
           match what's found in the body of altivec_yuv2packedX() */
        if(c->dstFormat==IMGFMT_ABGR  || c->dstFormat==IMGFMT_BGRA  ||
           c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 ||
           c->dstFormat==IMGFMT_RGBA  || c->dstFormat==IMGFMT_ARGB)
            altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                                 chrFilter, chrSrc, chrFilterSize,
                                 dest, dstW, dstY);
        else
#endif
            yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                           chrFilter, chrSrc, chrFilterSize,
                           dest, dstW, dstY);
        break;
    }
}

986 | |
987 /** | |
988 * vertical bilinear scale YV12 to RGB | |
989 */ | |
7723 | 990 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, |
6578 | 991 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
992 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
993 int yalpha1=yalpha^4095; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
994 int uvalpha1=uvalpha^4095; |
6578 | 995 int i; |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
996 |
11000 | 997 #if 0 //isn't used |
    if(flags&SWS_FULL_CHR_H_INT)
    {
        switch(dstFormat)
        {
#ifdef HAVE_MMX
        case IMGFMT_BGR32:
            asm volatile(


                FULL_YSCALEYUV2RGB
                "punpcklbw %%mm1, %%mm3     \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0     \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1          \n\t"
                "punpcklwd %%mm0, %%mm3     \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1     \n\t" // BGR0BGR0

                MOVNTQ(%%mm3, (%4, %%REGa, 4))
                MOVNTQ(%%mm1, 8(%4, %%REGa, 4))

                "add $4, %%"REG_a"          \n\t"
                "cmp %5, %%"REG_a"          \n\t"
                " jb 1b                     \n\t"


                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a
            );
            break;
        case IMGFMT_BGR24:
            asm volatile(

                FULL_YSCALEYUV2RGB

                // lsb ... msb
                "punpcklbw %%mm1, %%mm3     \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0     \n\t" // R0R0R0R0

                "movq %%mm3, %%mm1          \n\t"
                "punpcklwd %%mm0, %%mm3     \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1     \n\t" // BGR0BGR0

                "movq %%mm3, %%mm2          \n\t" // BGR0BGR0
                "psrlq $8, %%mm3            \n\t" // GR0BGR00
                "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
                "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
                "por %%mm2, %%mm3           \n\t" // BGRBGR00
                "movq %%mm1, %%mm2          \n\t"
                "psllq $48, %%mm1           \n\t" // 000000BG
                "por %%mm1, %%mm3           \n\t" // BGRBGRBG

                "movq %%mm2, %%mm1          \n\t" // BGR0BGR0
                "psrld $16, %%mm2           \n\t" // R000R000
                "psrlq $24, %%mm1           \n\t" // 0BGR0000
                "por %%mm2, %%mm1           \n\t" // RBGRR000

                "mov %4, %%"REG_b"          \n\t"
                "add %%"REG_a", %%"REG_b"   \n\t"

#ifdef HAVE_MMX2
                //FIXME Alignment
                "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
                "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
#else
                "movd %%mm3, (%%"REG_b", %%"REG_a", 2)  \n\t"
                "psrlq $32, %%mm3           \n\t"
                "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
                "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
#endif
                "add $4, %%"REG_a"          \n\t"
                "cmp %5, %%"REG_a"          \n\t"
                " jb 1b                     \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a, "%"REG_b
            );
            break;
        case IMGFMT_BGR15:
            asm volatile(

                FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
                "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
                "punpcklbw %%mm7, %%mm1     \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3     \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0     \n\t" // 0R0R0R0R

                "psrlw $3, %%mm3            \n\t"
                "psllw $2, %%mm1            \n\t"
                "psllw $7, %%mm0            \n\t"
                "pand "MANGLE(g15Mask)", %%mm1  \n\t"
                "pand "MANGLE(r15Mask)", %%mm0  \n\t"

                "por %%mm3, %%mm1           \n\t"
                "por %%mm1, %%mm0           \n\t"

                MOVNTQ(%%mm0, (%4, %%REGa, 2))

                "add $4, %%"REG_a"          \n\t"
                "cmp %5, %%"REG_a"          \n\t"
                " jb 1b                     \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a
            );
            break;
        case IMGFMT_BGR16:
            asm volatile(

                FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
                "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
                "punpcklbw %%mm7, %%mm1     \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3     \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0     \n\t" // 0R0R0R0R

                "psrlw $3, %%mm3            \n\t"
                "psllw $3, %%mm1            \n\t"
                "psllw $8, %%mm0            \n\t"
                "pand "MANGLE(g16Mask)", %%mm1  \n\t"
                "pand "MANGLE(r16Mask)", %%mm0  \n\t"

                "por %%mm3, %%mm1           \n\t"
                "por %%mm1, %%mm0           \n\t"

                MOVNTQ(%%mm0, (%4, %%REGa, 2))

                "add $4, %%"REG_a"          \n\t"
                "cmp %5, %%"REG_a"          \n\t"
                " jb 1b                     \n\t"

                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%"REG_a
            );
            break;
#endif
        case IMGFMT_RGB32:
#ifndef HAVE_MMX
        case IMGFMT_BGR32:
#endif
        if(dstFormat==IMGFMT_BGR32)
        {
            int i;
#ifdef WORDS_BIGENDIAN
            dest++;
#endif
            for(i=0;i<dstW;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                dest+= 4;
            }
        }
        else if(dstFormat==IMGFMT_BGR24)
        {
            int i;
            for(i=0;i<dstW;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                dest+= 3;
            }
        }
        else if(dstFormat==IMGFMT_BGR16)
        {
            int i;
            for(i=0;i<dstW;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                ((uint16_t*)dest)[i] =
                    clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
                    clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                    clip_table16r[(Y + yuvtab_3343[V]) >>13];
            }
        }
        else if(dstFormat==IMGFMT_BGR15)
        {
            int i;
            for(i=0;i<dstW;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                ((uint16_t*)dest)[i] =
                    clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
                    clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                    clip_table15r[(Y + yuvtab_3343[V]) >>13];
            }
        }
    }//FULL_UV_IPOL
    else
    {
#endif // if 0
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1213 #ifdef HAVE_MMX |
6578 | 1214 switch(c->dstFormat) |
1215 { | |
11000 | 1216 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( |
6578 | 1217 case IMGFMT_BGR32: |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1218 asm volatile( |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1219 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1220 "mov %4, %%"REG_SP" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1221 YSCALEYUV2RGB(%%REGa, %5) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1222 WRITEBGR32(%%REGSP, 8280(%5), %%REGa) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1223 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1224 |
9414 | 1225 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), |
1226 "r" (&c->redDither) | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1227 : "%"REG_a |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1228 ); |
6578 | 1229 return; |
1230 case IMGFMT_BGR24: | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1231 asm volatile( |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1232 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1233 "mov %4, %%"REG_SP" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1234 YSCALEYUV2RGB(%%REGa, %5) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1235 WRITEBGR24(%%REGSP, 8280(%5), %%REGa) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1236 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9414 | 1237 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), |
1238 "r" (&c->redDither) | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1239 : "%"REG_a |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1240 ); |
6578 | 1241 return; |
1242 case IMGFMT_BGR15: | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1243 asm volatile( |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1244 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1245 "mov %4, %%"REG_SP" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1246 YSCALEYUV2RGB(%%REGa, %5) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1247 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1248 #ifdef DITHER1XBPP |
4248 | 1249 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1250 "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | |
1251 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1252 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1253 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1254 WRITEBGR15(%%REGSP, 8280(%5), %%REGa) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1255 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1256 |
9414 | 1257 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), |
1258 "r" (&c->redDither) | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1259 : "%"REG_a |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1260 ); |
6578 | 1261 return; |
1262 case IMGFMT_BGR16: | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1263 asm volatile( |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1264 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1265 "mov %4, %%"REG_SP" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1266 YSCALEYUV2RGB(%%REGa, %5) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1267 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1268 #ifdef DITHER1XBPP |
4248 | 1269 "paddusb "MANGLE(b5Dither)", %%mm2\n\t" |
1270 "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | |
1271 "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1272 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1273 |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1274 WRITEBGR16(%%REGSP, 8280(%5), %%REGa) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1275 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9414 | 1276 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), |
1277 "r" (&c->redDither) | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1278 : "%"REG_a |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1279 ); |
6578 | 1280 return; |
7723 | 1281 case IMGFMT_YUY2: |
1282 asm volatile( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1283 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1284 "mov %4, %%"REG_SP" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1285 YSCALEYUV2PACKED(%%REGa, %5) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1286 WRITEYUY2(%%REGSP, 8280(%5), %%REGa) |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1287 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t" |
9414 | 1288 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), |
1289 "r" (&c->redDither) | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1290 : "%"REG_a |
7723 | 1291 ); |
1292 return; | |
6578 | 1293 default: break; |
1294 } | |
1295 #endif //HAVE_MMX | |
7723 | 1296 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1297 } |
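/*
 * Illustrative sketch, added for clarity and not part of the scaler itself:
 * the clip_table15{b,g,r} and clip_table16{b,g,r} lookups above fold the
 * clamp to 0..255 and the RGB555/RGB565 bit packing into one table access.
 * Written out in plain arithmetic, one BGR16 pixel is packed roughly like
 * this (the helper name and explicit clamps are ours; only the assumed
 * 5-6-5 bit layout is taken from the surrounding code):
 */
static inline uint16_t example_pack_rgb565(int r, int g, int b)
{
    if(r<0) r=0; if(r>255) r=255;
    if(g<0) g=0; if(g>255) g=255;
    if(b<0) b=0; if(b>255) b=255;
    /* 5 bits red in the high bits, 6 bits green, 5 bits blue */
    return (uint16_t)(((r>>3)<<11) | ((g>>2)<<5) | (b>>3));
}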

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
    const int yalpha= 4096; //FIXME ...

    if(flags&SWS_FULL_CHR_H_INT)
    {
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
        return;
    }

#ifdef HAVE_MMX
    if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
    {
        switch(dstFormat)
        {
        case IMGFMT_BGR32:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1(%%REGa, %5)
                WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
            );
            return;
        case IMGFMT_BGR24:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1(%%REGa, %5)
                WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
            );
            return;
        case IMGFMT_BGR15:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
            );
            return;
        case IMGFMT_BGR16:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
            );
            return;
        case IMGFMT_YUY2:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2PACKED1(%%REGa, %5)
                WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
            );
            return;
        }
    }
    else
    {
        switch(dstFormat)
        {
        case IMGFMT_BGR32:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1b(%%REGa, %5)
                WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
            );
            return;
        case IMGFMT_BGR24:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1b(%%REGa, %5)
                WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
            );
            return;
        case IMGFMT_BGR15:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1b(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
            );
            return;
        case IMGFMT_BGR16:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2RGB1b(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
            );
            return;
        case IMGFMT_YUY2:
            asm volatile(
                "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_SP" \n\t"
                YSCALEYUV2PACKED1b(%%REGa, %5)
                WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
                "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
               "r" (&c->redDither)
            : "%"REG_a
            );
            return;
        }
    }
#endif
    if( uvalpha < 2048 )
    {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
    }else{
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
    }
}
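/*
 * Illustrative sketch, not taken from this file: the *RGB1* paths above turn
 * one Y,U,V triple into R,G,B without any vertical blending.  With the common
 * BT.601 integer coefficients (scaled by 1<<16) that per-pixel step looks
 * roughly like the helper below; the real code instead goes through the
 * yuvtab_xxxx / clip_table lookups and the MMX macros, whose exact
 * coefficients are set up elsewhere, so treat the constants here only as an
 * approximation of what those tables encode.
 */
static inline void example_yuv2rgb_pixel(int y, int u, int v, uint8_t *r, uint8_t *g, uint8_t *b)
{
    int Y= (y - 16) * 76309;                         /* ~ 255/219 in 16.16 fixed point */
    int R= (Y + 104597*(v-128)) >> 16;
    int G= (Y -  25675*(u-128) - 53279*(v-128)) >> 16;
    int B= (Y + 132201*(u-128)) >> 16;
    *r= R<0 ? 0 : (R>255 ? 255 : R);
    *g= G<0 ? 0 : (G>255 ? 255 : G);
    *b= B<0 ? 0 : (B>255 ? 255 : B);
}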

//FIXME yuy2* can read up to 7 samples too many

static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    asm volatile(
        "movq "MANGLE(bm01010101)", %%mm2\n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for(i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}

static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    asm volatile(
        "movq "MANGLE(bm01010101)", %%mm4\n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "movq (%2, %%"REG_a",4), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
        PAVGB(%%mm2, %%mm0)
        PAVGB(%%mm3, %%mm1)
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%4, %%"REG_a") \n\t"
        "movd %%mm1, (%3, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for(i=0; i<width; i++)
    {
        dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
        dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
    }
#endif
}
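/*
 * Illustrative sketch, not used by the scaler: in YUY2 the bytes of a pixel
 * pair are interleaved as Y0 U Y1 V.  The pand/psrlw/packuswb selection in
 * the MMX code above amounts to picking the even bytes (luma) or the odd
 * bytes (chroma).  A scalar split of one YUY2 line into planar Y, U, V
 * (helper name ours) makes that layout explicit:
 */
static inline void example_split_yuy2(const uint8_t *src, uint8_t *y, uint8_t *u, uint8_t *v, int pairs)
{
    int i;
    for(i=0; i<pairs; i++)
    {
        y[2*i+0]= src[4*i+0];
        u[i]    = src[4*i+1];
        y[2*i+1]= src[4*i+2];
        v[i]    = src[4*i+3];
    }
}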

//this is almost identical to the previous one and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    asm volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for(i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}

static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    asm volatile(
        "movq "MANGLE(bm01010101)", %%mm4\n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "movq (%2, %%"REG_a",4), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
        PAVGB(%%mm2, %%mm0)
        PAVGB(%%mm3, %%mm1)
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%4, %%"REG_a") \n\t"
        "movd %%mm1, (%3, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for(i=0; i<width; i++)
    {
        dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
        dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
    }
#endif
}

static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
{
    int i;
    for(i=0; i<width; i++)
    {
        int b= ((uint32_t*)src)[i]&0xFF;
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
        int r= (((uint32_t*)src)[i]>>16)&0xFF;

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
    }
}

static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    for(i=0; i<width; i++)
    {
        const int a= ((uint32_t*)src1)[2*i+0];
        const int e= ((uint32_t*)src1)[2*i+1];
        const int c= ((uint32_t*)src2)[2*i+0];
        const int d= ((uint32_t*)src2)[2*i+1];
        const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
        const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
        const int b= l&0x3FF;
        const int g= h>>8;
        const int r= l>>16;

        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
    }
}
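/*
 * Illustrative sketch, not part of the scaler: the masked sums in bgr32ToUV
 * above (0xFF00FF and 0x00FF00) add the four pixels of a 2x2 block per
 * channel in one pass.  Channel by channel the same 2x2 average looks like
 * this (helper name ours; RU, GU, BU, RV, GV, BV and RGB2YUV_SHIFT are the
 * same coefficients used above):
 */
static inline void example_bgr32_2x2_uv(const uint32_t *row0, const uint32_t *row1,
                                        uint8_t *dstU, uint8_t *dstV, int i)
{
    int b= (row0[2*i]&0xFF)       + (row0[2*i+1]&0xFF)       + (row1[2*i]&0xFF)       + (row1[2*i+1]&0xFF);
    int g= ((row0[2*i]>>8)&0xFF)  + ((row0[2*i+1]>>8)&0xFF)  + ((row1[2*i]>>8)&0xFF)  + ((row1[2*i+1]>>8)&0xFF);
    int r= ((row0[2*i]>>16)&0xFF) + ((row0[2*i+1]>>16)&0xFF) + ((row1[2*i]>>16)&0xFF) + ((row1[2*i+1]>>16)&0xFF);

    dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
    dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
}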

static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    asm volatile(
        "mov %2, %%"REG_a" \n\t"
        "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
        "movq "MANGLE(w1111)", %%mm5 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0, %%"REG_b") \n\t"
        "movd (%0, %%"REG_b"), %%mm0 \n\t"
        "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
        "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm0 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
#endif
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm3, %%mm2 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "psraw $7, %%mm0 \n\t"

        "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
        "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
        "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm4 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
#endif
        "packssdw %%mm1, %%mm4 \n\t"
        "packssdw %%mm3, %%mm2 \n\t"
        "pmaddwd %%mm5, %%mm4 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "add $24, %%"REG_b" \n\t"
        "packssdw %%mm2, %%mm4 \n\t"
        "psraw $7, %%mm4 \n\t"

        "packuswb %%mm4, %%mm0 \n\t"
        "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"

        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "r" (src+width*3), "r" (dst+width), "g" (-width)
        : "%"REG_a, "%"REG_b
    );
#else
    int i;
    for(i=0; i<width; i++)
    {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
    }
#endif
}
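/*
 * Illustrative sketch: with the RGB2YUV_SHIFT fixed-point coefficients, the
 * scalar fallback above computes  Y = (RY*r + GY*g + BY*b + rounding) >>
 * RGB2YUV_SHIFT, where the (33<<(RGB2YUV_SHIFT-1)) term supplies the +16
 * offset plus rounding.  For a white pixel r=g=b=255 this comes out at
 * roughly 235, i.e. the usual limited 16..235 luma range of YV12.  A
 * standalone version of that step (helper name ours; the coefficients are
 * taken as already defined for this file):
 */
static inline uint8_t example_rgb_to_y(int r, int g, int b)
{
    return (uint8_t)((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT);
}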
1740 | |
16739
e91f944f6ed9
Change unsigned->signed and int->long, this fits the asm code better on 64
reimar
parents:
15972
diff
changeset
|
1741 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) |
4467 | 1742 { |
4619 | 1743 #ifdef HAVE_MMX |
1744 asm volatile( | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1745 "mov %4, %%"REG_a" \n\t" |
4923 | 1746 "movq "MANGLE(w1111)", %%mm5 \n\t" |
1747 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t" | |
4619 | 1748 "pxor %%mm7, %%mm7 \n\t" |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1749 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1750 "add %%"REG_b", %%"REG_b" \n\t" |
4619 | 1751 ".balign 16 \n\t" |
1752 "1: \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1753 PREFETCH" 64(%0, %%"REG_b") \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1754 PREFETCH" 64(%1, %%"REG_b") \n\t" |
4619 | 1755 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1756 "movq (%0, %%"REG_b"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1757 "movq (%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1758 "movq 6(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1759 "movq 6(%1, %%"REG_b"), %%mm3 \n\t" |
4619 | 1760 PAVGB(%%mm1, %%mm0) |
1761 PAVGB(%%mm3, %%mm2) | |
1762 "movq %%mm0, %%mm1 \n\t" | |
1763 "movq %%mm2, %%mm3 \n\t" | |
1764 "psrlq $24, %%mm0 \n\t" | |
1765 "psrlq $24, %%mm2 \n\t" | |
1766 PAVGB(%%mm1, %%mm0) | |
1767 PAVGB(%%mm3, %%mm2) | |
1768 "punpcklbw %%mm7, %%mm0 \n\t" | |
1769 "punpcklbw %%mm7, %%mm2 \n\t" | |
1770 #else | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1771 "movd (%0, %%"REG_b"), %%mm0 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1772 "movd (%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1773 "movd 3(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1774 "movd 3(%1, %%"REG_b"), %%mm3 \n\t" |
4619 | 1775 "punpcklbw %%mm7, %%mm0 \n\t" |
1776 "punpcklbw %%mm7, %%mm1 \n\t" | |
1777 "punpcklbw %%mm7, %%mm2 \n\t" | |
1778 "punpcklbw %%mm7, %%mm3 \n\t" | |
1779 "paddw %%mm1, %%mm0 \n\t" | |
1780 "paddw %%mm3, %%mm2 \n\t" | |
1781 "paddw %%mm2, %%mm0 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1782 "movd 6(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1783 "movd 6(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1784 "movd 9(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1785 "movd 9(%1, %%"REG_b"), %%mm3 \n\t" |
4619 | 1786 "punpcklbw %%mm7, %%mm4 \n\t" |
1787 "punpcklbw %%mm7, %%mm1 \n\t" | |
1788 "punpcklbw %%mm7, %%mm2 \n\t" | |
1789 "punpcklbw %%mm7, %%mm3 \n\t" | |
1790 "paddw %%mm1, %%mm4 \n\t" | |
1791 "paddw %%mm3, %%mm2 \n\t" | |
1792 "paddw %%mm4, %%mm2 \n\t" | |
1793 "psrlw $2, %%mm0 \n\t" | |
1794 "psrlw $2, %%mm2 \n\t" | |
1795 #endif | |
4923 | 1796 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
1797 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4619 | 1798 |
1799 "pmaddwd %%mm0, %%mm1 \n\t" | |
1800 "pmaddwd %%mm2, %%mm3 \n\t" | |
1801 "pmaddwd %%mm6, %%mm0 \n\t" | |
1802 "pmaddwd %%mm6, %%mm2 \n\t" | |
1803 #ifndef FAST_BGR2YV12 | |
1804 "psrad $8, %%mm0 \n\t" | |
1805 "psrad $8, %%mm1 \n\t" | |
1806 "psrad $8, %%mm2 \n\t" | |
1807 "psrad $8, %%mm3 \n\t" | |
1808 #endif | |
1809 "packssdw %%mm2, %%mm0 \n\t" | |
1810 "packssdw %%mm3, %%mm1 \n\t" | |
1811 "pmaddwd %%mm5, %%mm0 \n\t" | |
1812 "pmaddwd %%mm5, %%mm1 \n\t" | |
1813 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | |
1814 "psraw $7, %%mm0 \n\t" | |
1815 | |
1816 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1817 "movq 12(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1818 "movq 12(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1819 "movq 18(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1820 "movq 18(%1, %%"REG_b"), %%mm3 \n\t" |
4619 | 1821 PAVGB(%%mm1, %%mm4) |
1822 PAVGB(%%mm3, %%mm2) | |
1823 "movq %%mm4, %%mm1 \n\t" | |
1824 "movq %%mm2, %%mm3 \n\t" | |
1825 "psrlq $24, %%mm4 \n\t" | |
1826 "psrlq $24, %%mm2 \n\t" | |
1827 PAVGB(%%mm1, %%mm4) | |
1828 PAVGB(%%mm3, %%mm2) | |
1829 "punpcklbw %%mm7, %%mm4 \n\t" | |
1830 "punpcklbw %%mm7, %%mm2 \n\t" | |
1831 #else | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1832 "movd 12(%0, %%"REG_b"), %%mm4 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1833 "movd 12(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1834 "movd 15(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1835 "movd 15(%1, %%"REG_b"), %%mm3 \n\t" |
4619 | 1836 "punpcklbw %%mm7, %%mm4 \n\t" |
1837 "punpcklbw %%mm7, %%mm1 \n\t" | |
1838 "punpcklbw %%mm7, %%mm2 \n\t" | |
1839 "punpcklbw %%mm7, %%mm3 \n\t" | |
1840 "paddw %%mm1, %%mm4 \n\t" | |
1841 "paddw %%mm3, %%mm2 \n\t" | |
1842 "paddw %%mm2, %%mm4 \n\t" | |
13720
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1843 "movd 18(%0, %%"REG_b"), %%mm5 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1844 "movd 18(%1, %%"REG_b"), %%mm1 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1845 "movd 21(%0, %%"REG_b"), %%mm2 \n\t" |
821f464b4d90
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
aurel
parents:
12698
diff
changeset
|
1846 "movd 21(%1, %%"REG_b"), %%mm3 \n\t" |
4619 | 1847 "punpcklbw %%mm7, %%mm5 \n\t" |
1848 "punpcklbw %%mm7, %%mm1 \n\t" | |
1849 "punpcklbw %%mm7, %%mm2 \n\t" | |
1850 "punpcklbw %%mm7, %%mm3 \n\t" | |
1851 "paddw %%mm1, %%mm5 \n\t" | |
1852 "paddw %%mm3, %%mm2 \n\t" | |
1853 "paddw %%mm5, %%mm2 \n\t" | |
4923 | 1854 "movq "MANGLE(w1111)", %%mm5 \n\t" |
4619 | 1855 "psrlw $2, %%mm4 \n\t" |
1856 "psrlw $2, %%mm2 \n\t" | |
1857 #endif | |
4923 | 1858 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t" |
1859 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t" | |
4619 | 1860 |
1861 "pmaddwd %%mm4, %%mm1 \n\t" | |
1862 "pmaddwd %%mm2, %%mm3 \n\t" | |
1863 "pmaddwd %%mm6, %%mm4 \n\t" | |
1864 "pmaddwd %%mm6, %%mm2 \n\t" | |
1865 #ifndef FAST_BGR2YV12 | |
1866 "psrad $8, %%mm4 \n\t" | |
1867 "psrad $8, %%mm1 \n\t" | |
1868 "psrad $8, %%mm2 \n\t" | |
1869 "psrad $8, %%mm3 \n\t" | |
1870 #endif | |
1871 "packssdw %%mm2, %%mm4 \n\t" | |
1872 "packssdw %%mm3, %%mm1 \n\t" | |
1873 "pmaddwd %%mm5, %%mm4 \n\t" | |
1874 "pmaddwd %%mm5, %%mm1 \n\t" | |
1875 "add $24, %%"REG_b" \n\t" |
4619 | 1876 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 |
1877 "psraw $7, %%mm4 \n\t" | |
1878 | |
1879 "movq %%mm0, %%mm1 \n\t" | |
1880 "punpckldq %%mm4, %%mm0 \n\t" | |
1881 "punpckhdq %%mm4, %%mm1 \n\t" | |
1882 "packsswb %%mm1, %%mm0 \n\t" | |
4923 | 1883 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" |
4619 | 1884 |
1885 "movd %%mm0, (%2, %%"REG_a") \n\t" |
4619 | 1886 "punpckhdq %%mm0, %%mm0 \n\t" |
1887 "movd %%mm0, (%3, %%"REG_a") \n\t" |
1888 "add $4, %%"REG_a" \n\t" |
4619 | 1889 " js 1b \n\t" |
1890 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width) |
1891 : "%"REG_a, "%"REG_b |
4619 | 1892 ); |
4467 | 1893 #else |
1894 int i; | |
1895 for(i=0; i<width; i++) | |
1896 { | |
1897 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3]; | |
1898 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4]; | |
1899 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5]; | |
1900 | |
1901 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1902 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
1903 } | |
1904 #endif | |
1905 } | |
1906 | |
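/* Roughly what the bgr16/bgr15 readers below do: the pixels are packed 5-6-5 /
   5-5-5, and an 8-bit channel is just the narrow field shifted left by 3 (or by
   2 for the 6-bit green). Instead of widening every pixel the code keeps the
   narrow fields and compensates in the final shift, e.g. for RGB565
   RY*(r<<3) + GY*(g<<2) + BY*(b<<3) >> RGB2YUV_SHIFT
   == (2*RY*r + GY*g + 2*BY*b) >> (RGB2YUV_SHIFT-2), which is the form used. */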
4578 | 1907 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width) |
1908 { | |
1909 int i; | |
1910 for(i=0; i<width; i++) | |
1911 { | |
9433 | 1912 int d= ((uint16_t*)src)[i]; |
4578 | 1913 int b= d&0x1F; |
1914 int g= (d>>5)&0x3F; | |
1915 int r= (d>>11)&0x1F; | |
1916 | |
1917 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16; | |
1918 } | |
1919 } | |
1920 | |
1921 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1922 { | |
1923 int i; | |
1924 for(i=0; i<width; i++) | |
1925 { | |
9433 | 1926 int d0= ((uint32_t*)src1)[i]; |
1927 int d1= ((uint32_t*)src2)[i]; | |
4579 | 1928 |
1929 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F); | |
1930 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F); | |
1931 | |
1932 int dh2= (dh>>11) + (dh<<21); | |
1933 int d= dh2 + dl; | |
1934 | |
1935 int b= d&0x7F; | |
1936 int r= (d>>11)&0x7F; | |
1937 int g= d>>21; | |
4578 | 1938 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128; |
1939 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128; | |
1940 } | |
1941 } | |
1942 | |
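/* Sketch of the trick used in bgr16ToUV above: each uint32_t load holds two
   RGB565 pixels, and the 0x07E0F81F / 0x07C0F83F masks pull alternating colour
   fields apart so the four pixels of a 2x2 block can be summed without one
   field's carry spilling into a neighbouring one; after the recombination b, r
   and g hold the 4-pixel sums. The shift (RGB2YUV_SHIFT+2-2) is +2 for the 4x
   sum and -2 for the 5/6-bit channel depth, as in bgr16ToY. */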
4580 | 1943 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width) |
1944 { | |
1945 int i; | |
1946 for(i=0; i<width; i++) | |
1947 { | |
9433 | 1948 int d= ((uint16_t*)src)[i]; |
4580 | 1949 int b= d&0x1F; |
1950 int g= (d>>5)&0x1F; | |
1951 int r= (d>>10)&0x1F; | |
1952 | |
1953 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16; | |
1954 } | |
1955 } | |
1956 | |
1957 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1958 { | |
1959 int i; | |
1960 for(i=0; i<width; i++) | |
1961 { | |
9433 | 1962 int d0= ((uint32_t*)src1)[i]; |
1963 int d1= ((uint32_t*)src2)[i]; | |
4580 | 1964 |
1965 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F); | |
1966 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F); | |
1967 | |
1968 int dh2= (dh>>11) + (dh<<21); | |
1969 int d= dh2 + dl; | |
1970 | |
1971 int b= d&0x7F; | |
1972 int r= (d>>10)&0x7F; | |
1973 int g= d>>21; | |
1974 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128; | |
1975 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128; | |
1976 } | |
1977 } | |
1978 | |
1979 | |
4558 | 1980 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width) |
1981 { | |
1982 int i; | |
1983 for(i=0; i<width; i++) | |
1984 { | |
9433 | 1985 int r= ((uint32_t*)src)[i]&0xFF; |
1986 int g= (((uint32_t*)src)[i]>>8)&0xFF; | |
9499 | 1987 int b= (((uint32_t*)src)[i]>>16)&0xFF; |
4558 | 1988 |
9433 | 1989 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); |
4558 | 1990 } |
1991 } | |
1992 | |
1993 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
1994 { | |
1995 int i; | |
1996 for(i=0; i<width; i++) | |
1997 { | |
9433 | 1998 const int a= ((uint32_t*)src1)[2*i+0]; |
1999 const int e= ((uint32_t*)src1)[2*i+1]; | |
2000 const int c= ((uint32_t*)src2)[2*i+0]; | |
2001 const int d= ((uint32_t*)src2)[2*i+1]; | |
2002 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF); | |
2003 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00); | |
2004 const int r= l&0x3FF; | |
2005 const int g= h>>8; | |
2006 const int b= l>>16; | |
4558 | 2007 |
2008 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2009 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2010 } | |
2011 } | |
2012 | |
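/* rgb32ToUV above also averages a 2x2 block: a/e are two neighbouring pixels
   from the first line, c/d the two below them. Masking with 0xFF00FF (R and B)
   and 0x00FF00 (G) keeps the 8-bit fields 16 bits apart, so the four pixels can
   be added in plain ints without the sums (at most 1020) overlapping; r, g and
   b then hold 4-pixel sums and the >>(RGB2YUV_SHIFT+2) divides them back out. */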
2013 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width) | |
2014 { | |
2015 int i; | |
2016 for(i=0; i<width; i++) | |
2017 { | |
2018 int r= src[i*3+0]; | |
2019 int g= src[i*3+1]; | |
2020 int b= src[i*3+2]; | |
2021 | |
9433 | 2022 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT); |
4558 | 2023 } |
2024 } | |
2025 | |
2026 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | |
2027 { | |
2028 int i; | |
2029 for(i=0; i<width; i++) | |
2030 { | |
2031 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3]; | |
2032 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4]; | |
2033 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5]; | |
2034 | |
2035 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2036 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128; | |
2037 } | |
2038 } | |
2039 | |
4467 | 2040 |
3272 | 2041 // Bilinear / Bicubic scaling |
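/* hScale() is a plain polyphase FIR: for each output sample i it takes
   filterSize consecutive input pixels starting at filterPos[i], multiplies them
   by the int16 coefficients filter[i*filterSize..], shifts the sum right by 7
   and clips to 0..(1<<15)-1, producing the 15-bit intermediate line used by the
   vertical scaler (see the C reference loop at the end of the function). */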
2042 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, | |
2043 int16_t *filter, int16_t *filterPos, long filterSize) |
3272 | 2044 { |
2045 #ifdef HAVE_MMX | |
2046 assert(filterSize % 4 == 0 && filterSize>0); |
3272 | 2047 if(filterSize==4) // always true for upscaling, sometimes for downscaling too
2048 { | |
2049 long counter= -2*dstW; |
3272 | 2050 filter-= counter*2; |
2051 filterPos-= counter/2; | |
2052 dst-= counter/2; | |
2053 asm volatile( | |
2054 "pxor %%mm7, %%mm7 \n\t" | |
4248 | 2055 "movq "MANGLE(w02)", %%mm6 \n\t" |
2056 "push %%"REG_BP" \n\t" // we use 7 regs here ... |
2057 "mov %%"REG_a", %%"REG_BP" \n\t" |
3272 | 2058 ".balign 16 \n\t" |
2059 "1: \n\t" | |
2060 "movzwl (%2, %%"REG_BP"), %%eax \n\t" |
2061 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t" |
2062 "movq (%1, %%"REG_BP", 4), %%mm1\n\t" |
2063 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t" |
2064 "movd (%3, %%"REG_a"), %%mm0 \n\t" |
2065 "movd (%3, %%"REG_b"), %%mm2 \n\t" |
3272 | 2066 "punpcklbw %%mm7, %%mm0 \n\t" |
2067 "punpcklbw %%mm7, %%mm2 \n\t" | |
2068 "pmaddwd %%mm1, %%mm0 \n\t" | |
2069 "pmaddwd %%mm2, %%mm3 \n\t" | |
2070 "psrad $8, %%mm0 \n\t" | |
2071 "psrad $8, %%mm3 \n\t" | |
2072 "packssdw %%mm3, %%mm0 \n\t" | |
2073 "pmaddwd %%mm6, %%mm0 \n\t" | |
2074 "packssdw %%mm0, %%mm0 \n\t" | |
2075 "movd %%mm0, (%4, %%"REG_BP") \n\t" |
2076 "add $4, %%"REG_BP" \n\t" |
3272 | 2077 " jnc 1b \n\t" |
3352 | 2078 |
2079 "pop %%"REG_BP" \n\t" |
3272 | 2080 : "+a" (counter) |
2081 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
2082 : "%"REG_b |
3272 | 2083 ); |
2084 } | |
2085 else if(filterSize==8) | |
2086 { | |
2087 long counter= -2*dstW; |
3272 | 2088 filter-= counter*4; |
2089 filterPos-= counter/2; | |
2090 dst-= counter/2; | |
2091 asm volatile( | |
2092 "pxor %%mm7, %%mm7 \n\t" | |
4248 | 2093 "movq "MANGLE(w02)", %%mm6 \n\t" |
2094 "push %%"REG_BP" \n\t" // we use 7 regs here ... |
2095 "mov %%"REG_a", %%"REG_BP" \n\t" |
3272 | 2096 ".balign 16 \n\t" |
2097 "1: \n\t" | |
2098 "movzwl (%2, %%"REG_BP"), %%eax \n\t" |
2099 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t" |
2100 "movq (%1, %%"REG_BP", 8), %%mm1\n\t" |
2101 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t" |
2102 "movd (%3, %%"REG_a"), %%mm0 \n\t" |
2103 "movd (%3, %%"REG_b"), %%mm2 \n\t" |
3272 | 2104 "punpcklbw %%mm7, %%mm0 \n\t" |
2105 "punpcklbw %%mm7, %%mm2 \n\t" | |
2106 "pmaddwd %%mm1, %%mm0 \n\t" | |
2107 "pmaddwd %%mm2, %%mm3 \n\t" | |
2108 |
2109 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t" |
2110 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t" |
2111 "movd 4(%3, %%"REG_a"), %%mm4 \n\t" |
2112 "movd 4(%3, %%"REG_b"), %%mm2 \n\t" |
3272 | 2113 "punpcklbw %%mm7, %%mm4 \n\t" |
2114 "punpcklbw %%mm7, %%mm2 \n\t" | |
2115 "pmaddwd %%mm1, %%mm4 \n\t" | |
2116 "pmaddwd %%mm2, %%mm5 \n\t" | |
2117 "paddd %%mm4, %%mm0 \n\t" | |
2118 "paddd %%mm5, %%mm3 \n\t" | |
2119 | |
2120 "psrad $8, %%mm0 \n\t" | |
2121 "psrad $8, %%mm3 \n\t" | |
2122 "packssdw %%mm3, %%mm0 \n\t" | |
2123 "pmaddwd %%mm6, %%mm0 \n\t" | |
2124 "packssdw %%mm0, %%mm0 \n\t" | |
2125 "movd %%mm0, (%4, %%"REG_BP") \n\t" |
2126 "add $4, %%"REG_BP" \n\t" |
3272 | 2127 " jnc 1b \n\t" |
3344 | 2128 |
2129 "pop %%"REG_BP" \n\t" |
3272 | 2130 : "+a" (counter) |
2131 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
2132 : "%"REG_b |
3272 | 2133 ); |
2134 } | |
2135 else | |
2136 { | |
2137 uint8_t *offset = src+filterSize; |
2138 long counter= -2*dstW; |
3272 | 2139 // filter-= counter*filterSize/2; |
2140 filterPos-= counter/2; | |
2141 dst-= counter/2; | |
2142 asm volatile( | |
2143 "pxor %%mm7, %%mm7 \n\t" | |
4248 | 2144 "movq "MANGLE(w02)", %%mm6 \n\t" |
3272 | 2145 ".balign 16 \n\t" |
2146 "1: \n\t" | |
2147 "mov %2, %%"REG_c" \n\t" |
2148 "movzwl (%%"REG_c", %0), %%eax \n\t" |
2149 "movzwl 2(%%"REG_c", %0), %%ebx \n\t" |
2150 "mov %5, %%"REG_c" \n\t" |
3272 | 2151 "pxor %%mm4, %%mm4 \n\t" |
2152 "pxor %%mm5, %%mm5 \n\t" | |
2153 "2: \n\t" | |
2154 "movq (%1), %%mm1 \n\t" | |
2155 "movq (%1, %6), %%mm3 \n\t" | |
2156 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t" |
2157 "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t" |
3272 | 2158 "punpcklbw %%mm7, %%mm0 \n\t" |
2159 "punpcklbw %%mm7, %%mm2 \n\t" | |
2160 "pmaddwd %%mm1, %%mm0 \n\t" | |
2161 "pmaddwd %%mm2, %%mm3 \n\t" | |
2162 "paddd %%mm3, %%mm5 \n\t" | |
2163 "paddd %%mm0, %%mm4 \n\t" | |
2164 "add $8, %1 \n\t" |
2165 "add $4, %%"REG_c" \n\t" |
2166 "cmp %4, %%"REG_c" \n\t" |
3272 | 2167 " jb 2b \n\t" |
2168 "add %6, %1 \n\t" |
3272 | 2169 "psrad $8, %%mm4 \n\t" |
2170 "psrad $8, %%mm5 \n\t" | |
2171 "packssdw %%mm5, %%mm4 \n\t" | |
2172 "pmaddwd %%mm6, %%mm4 \n\t" | |
2173 "packssdw %%mm4, %%mm4 \n\t" | |
2174 "mov %3, %%"REG_a" \n\t" |
2175 "movd %%mm4, (%%"REG_a", %0) \n\t" |
2176 "add $4, %0 \n\t" |
3272 | 2177 " jnc 1b \n\t" |
3344 | 2178 |
3641 | 2179 : "+r" (counter), "+r" (filter) |
2180 : "m" (filterPos), "m" (dst), "m"(offset), |
2181 "m" (src), "r" (filterSize*2) |
2182 : "%"REG_b, "%"REG_a, "%"REG_c |
3272 | 2183 ); |
2184 } | |
2185 #else | |
2186 #ifdef HAVE_ALTIVEC |
2187 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize); |
2188 #else |
3272 | 2189 int i; |
2190 for(i=0; i<dstW; i++) | |
2191 { | |
2192 int j; | |
2193 int srcPos= filterPos[i]; | |
2194 int val=0; | |
3344 | 2195 // printf("filterPos: %d\n", filterPos[i]); |
3272 | 2196 for(j=0; j<filterSize; j++) |
2197 { | |
2198 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); | |
2199 val += ((int)src[srcPos + j])*filter[filterSize*i + j]; | |
2200 } | |
2201 // filter += hFilterSize; | |
2202 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ... | |
2203 // dst[i] = val>>7; | |
2204 } | |
2205 #endif | |
2206 #endif |
3272 | 2207 } |
2208 // *** horizontal scale Y line to temp buffer | |
2209 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc, |
2210 int flags, int canMMX2BeUsed, int16_t *hLumFilter, |
4467 | 2211 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, |
5452 | 2212 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, |
2213 int32_t *mmx2FilterPos) | |
2469 | 2214 { |
4467 | 2215 if(srcFormat==IMGFMT_YUY2) |
2216 { | |
2217 RENAME(yuy2ToY)(formatConvBuffer, src, srcW); | |
2218 src= formatConvBuffer; | |
2219 } | |
9071 | 2220 else if(srcFormat==IMGFMT_UYVY) |
2221 { | |
2222 RENAME(uyvyToY)(formatConvBuffer, src, srcW); | |
2223 src= formatConvBuffer; | |
2224 } | |
4467 | 2225 else if(srcFormat==IMGFMT_BGR32) |
2226 { | |
2227 RENAME(bgr32ToY)(formatConvBuffer, src, srcW); | |
2228 src= formatConvBuffer; | |
2229 } | |
2230 else if(srcFormat==IMGFMT_BGR24) | |
2231 { | |
2232 RENAME(bgr24ToY)(formatConvBuffer, src, srcW); | |
2233 src= formatConvBuffer; | |
2234 } | |
4578 | 2235 else if(srcFormat==IMGFMT_BGR16) |
2236 { | |
2237 RENAME(bgr16ToY)(formatConvBuffer, src, srcW); | |
2238 src= formatConvBuffer; | |
2239 } | |
4580 | 2240 else if(srcFormat==IMGFMT_BGR15) |
2241 { | |
2242 RENAME(bgr15ToY)(formatConvBuffer, src, srcW); | |
2243 src= formatConvBuffer; | |
2244 } | |
4558 | 2245 else if(srcFormat==IMGFMT_RGB32) |
2246 { | |
2247 RENAME(rgb32ToY)(formatConvBuffer, src, srcW); | |
2248 src= formatConvBuffer; | |
2249 } | |
2250 else if(srcFormat==IMGFMT_RGB24) | |
2251 { | |
2252 RENAME(rgb24ToY)(formatConvBuffer, src, srcW); | |
2253 src= formatConvBuffer; | |
2254 } | |
4467 | 2255 |
3352 | 2256 #ifdef HAVE_MMX |
11000 | 2257 // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
2258 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) |
3352 | 2259 #else |
2260 if(!(flags&SWS_FAST_BILINEAR)) |
3352 | 2261 #endif |
3272 | 2262 { |
2263 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); | |
2264 } | |
2265 else // Fast Bilinear upscale / crap downscale | |
2266 { | |
2267 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
2469 | 2268 #ifdef HAVE_MMX2 |
2671 | 2269 int i; |
2469 | 2270 if(canMMX2BeUsed) |
2271 { | |
2272 asm volatile( | |
2273 "pxor %%mm7, %%mm7 \n\t" | |
2274 "mov %0, %%"REG_c" \n\t" |
2275 "mov %1, %%"REG_D" \n\t" |
2276 "mov %2, %%"REG_d" \n\t" |
2277 "mov %3, %%"REG_b" \n\t" |
2278 "xor %%"REG_a", %%"REG_a" \n\t" // i |
2279 PREFETCH" (%%"REG_c") \n\t" |
2280 PREFETCH" 32(%%"REG_c") \n\t" |
2281 PREFETCH" 64(%%"REG_c") \n\t" |
2520 | 2282 |
14556 | 2283 #ifdef ARCH_X86_64 |
2284 | |
2469 | 2285 #define FUNNY_Y_CODE \ |
14556 | 2286 "movl (%%"REG_b"), %%esi \n\t"\ |
5452 | 2287 "call *%4 \n\t"\ |
14556 | 2288 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\ |
2289 "add %%"REG_S", %%"REG_c" \n\t"\ | |
2290 "add %%"REG_a", %%"REG_D" \n\t"\ |
2291 "xor %%"REG_a", %%"REG_a" \n\t"\ |
2520 | 2292 |
14556 | 2293 #else |
2294 | |
2295 #define FUNNY_Y_CODE \ | |
2296 "movl (%%"REG_b"), %%esi \n\t"\ | |
2297 "call *%4 \n\t"\ | |
2298 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\ | |
2299 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2300 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2301 | |
2302 #endif | |
2303 | |
2469 | 2304 FUNNY_Y_CODE |
2305 FUNNY_Y_CODE | |
2306 FUNNY_Y_CODE | |
2307 FUNNY_Y_CODE | |
2308 FUNNY_Y_CODE | |
2309 FUNNY_Y_CODE | |
2310 FUNNY_Y_CODE | |
2311 FUNNY_Y_CODE | |
2312 | |
5452 | 2313 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2314 "m" (funnyYCode) | |
2315 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D |
2469 | 2316 ); |
3215 | 2317 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; |
2469 | 2318 } |
2319 else | |
2320 { | |
2321 #endif | |
2322 int xInc_shr16 = xInc >> 16; |
2323 int xInc_mask = xInc & 0xffff; |
2469 | 2324 //NO MMX just normal asm ... |
2325 asm volatile( | |
2326 "xor %%"REG_a", %%"REG_a" \n\t" // i |
2327 "xor %%"REG_b", %%"REG_b" \n\t" // xx |
2469 | 2328 "xorl %%ecx, %%ecx \n\t" // 2*xalpha |
2329 ".balign 16 \n\t" |
2469 | 2330 "1: \n\t" |
2331 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx] |
2332 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1] |
2469 | 2333 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2334 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2335 "shll $16, %%edi \n\t" | |
2336 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2337 "mov %1, %%"REG_D" \n\t" |
2469 | 2338 "shrl $9, %%esi \n\t" |
2339 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t" |
2469 | 2340 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF |
2341 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry |
2469 | 2342 |
2343 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx] |
2344 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1] |
2469 | 2345 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2346 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2347 "shll $16, %%edi \n\t" | |
2348 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2349 "mov %1, %%"REG_D" \n\t" |
2469 | 2350 "shrl $9, %%esi \n\t" |
2351 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t" |
2469 | 2352 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF |
2353 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry |
2469 | 2354 |
2355 | |
2356 "add $2, %%"REG_a" \n\t" |
2357 "cmp %2, %%"REG_a" \n\t" |
2469 | 2358 " jb 1b \n\t" |
2359 | |
2360 | |
2361 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask) |
2362 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi" |
2469 | 2363 ); |
2364 #ifdef HAVE_MMX2 | |
11000 | 2365 } //if MMX2 can't be used |
2469 | 2366 #endif |
2367 #else | |
2671 | 2368 int i; |
2369 unsigned int xpos=0; | |
2370 for(i=0;i<dstWidth;i++) | |
2371 { | |
2372 register unsigned int xx=xpos>>16; | |
2373 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2374 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; | |
2375 xpos+=xInc; | |
2376 } | |
2469 | 2377 #endif |
3272 | 2378 } |
2469 | 2379 } |
2380 | |
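/* In short: hyscale() converts one source line to 8-bit luma if needed (the
   IMGFMT_* branches above) and then scales it horizontally to dstWidth samples.
   The fast-bilinear path keeps the source position in 16.16 fixed point:
   xx = xpos>>16 and xalpha = (xpos&0xFFFF)>>9 (7 bits), so each output is
   src[xx]*(128-xalpha) + src[xx+1]*xalpha, a 15-bit linear interpolation;
   everything else goes through the generic hScale() filter above. */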
2381 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2, |
2382 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, |
4467 | 2383 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, |
5452 | 2384 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, |
2385 int32_t *mmx2FilterPos) | |
2469 | 2386 { |
4467 | 2387 if(srcFormat==IMGFMT_YUY2) |
2388 { | |
2389 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2390 src1= formatConvBuffer; | |
2391 src2= formatConvBuffer+2048; | |
2392 } | |
9071 | 2393 else if(srcFormat==IMGFMT_UYVY) |
2394 { | |
2395 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2396 src1= formatConvBuffer; | |
2397 src2= formatConvBuffer+2048; | |
2398 } | |
4467 | 2399 else if(srcFormat==IMGFMT_BGR32) |
2400 { | |
2401 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2402 src1= formatConvBuffer; | |
2403 src2= formatConvBuffer+2048; | |
2404 } | |
2405 else if(srcFormat==IMGFMT_BGR24) | |
2406 { | |
2407 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2408 src1= formatConvBuffer; | |
2409 src2= formatConvBuffer+2048; | |
2410 } | |
4578 | 2411 else if(srcFormat==IMGFMT_BGR16) |
2412 { | |
2413 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2414 src1= formatConvBuffer; | |
2415 src2= formatConvBuffer+2048; | |
2416 } | |
4580 | 2417 else if(srcFormat==IMGFMT_BGR15) |
2418 { | |
2419 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2420 src1= formatConvBuffer; | |
2421 src2= formatConvBuffer+2048; | |
2422 } | |
4558 | 2423 else if(srcFormat==IMGFMT_RGB32) |
2424 { | |
2425 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2426 src1= formatConvBuffer; | |
2427 src2= formatConvBuffer+2048; | |
2428 } | |
2429 else if(srcFormat==IMGFMT_RGB24) | |
2430 { | |
2431 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | |
2432 src1= formatConvBuffer; | |
2433 src2= formatConvBuffer+2048; | |
2434 } | |
4481 | 2435 else if(isGray(srcFormat)) |
2436 { | |
2437 return; | |
2438 } | |
4467 | 2439 |
3352 | 2440 #ifdef HAVE_MMX |
11000 | 2441 // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
2442 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) |
3352 | 2443 #else |
2444 if(!(flags&SWS_FAST_BILINEAR)) |
3352 | 2445 #endif |
3272 | 2446 { |
2447 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2448 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
2449 } | |
2450 else // Fast Bilinear upscale / crap downscale | |
2451 { | |
2452 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
2469 | 2453 #ifdef HAVE_MMX2 |
2671 | 2454 int i; |
2469 | 2455 if(canMMX2BeUsed) |
2456 { | |
2457 asm volatile( | |
5452 | 2458 "pxor %%mm7, %%mm7 \n\t" |
2459 "mov %0, %%"REG_c" \n\t" |
2460 "mov %1, %%"REG_D" \n\t" |
2461 "mov %2, %%"REG_d" \n\t" |
2462 "mov %3, %%"REG_b" \n\t" |
2463 "xor %%"REG_a", %%"REG_a" \n\t" // i |
2464 PREFETCH" (%%"REG_c") \n\t" |
2465 PREFETCH" 32(%%"REG_c") \n\t" |
2466 PREFETCH" 64(%%"REG_c") \n\t" |
5452 | 2467 |
14556 | 2468 #ifdef ARCH_X86_64 |
2469 | |
5452 | 2470 #define FUNNY_UV_CODE \ |
2471 "movl (%%"REG_b"), %%esi \n\t"\ |
5452 | 2472 "call *%4 \n\t"\ |
14556 | 2473 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\ |
2474 "add %%"REG_S", %%"REG_c" \n\t"\ | |
2475 "add %%"REG_a", %%"REG_D" \n\t"\ |
2476 "xor %%"REG_a", %%"REG_a" \n\t"\ |
2469 | 2477 |
14556 | 2478 #else |
2479 | |
2480 #define FUNNY_UV_CODE \ | |
2481 "movl (%%"REG_b"), %%esi \n\t"\ | |
2482 "call *%4 \n\t"\ | |
2483 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\ | |
2484 "add %%"REG_a", %%"REG_D" \n\t"\ | |
2485 "xor %%"REG_a", %%"REG_a" \n\t"\ | |
2486 | |
2487 #endif | |
2488 | |
5452 | 2489 FUNNY_UV_CODE |
2490 FUNNY_UV_CODE | |
2491 FUNNY_UV_CODE | |
2492 FUNNY_UV_CODE | |
2493 "xor %%"REG_a", %%"REG_a" \n\t" // i |
2494 "mov %5, %%"REG_c" \n\t" // src |
2495 "mov %1, %%"REG_D" \n\t" // buf1 |
2496 "add $4096, %%"REG_D" \n\t" |
2497 PREFETCH" (%%"REG_c") \n\t" |
2498 PREFETCH" 32(%%"REG_c") \n\t" |
2499 PREFETCH" 64(%%"REG_c") \n\t" |
2469 | 2500 |
5452 | 2501 FUNNY_UV_CODE |
2502 FUNNY_UV_CODE | |
2503 FUNNY_UV_CODE | |
2504 FUNNY_UV_CODE | |
2469 | 2505 |
5452 | 2506 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2507 "m" (funnyUVCode), "m" (src2) | |
14556 | 2508 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D |
5452 | 2509 ); |
3344 | 2510 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) |
2469 | 2511 { |
3344 | 2512 // printf("%d %d %d\n", dstWidth, i, srcW); |
2513 dst[i] = src1[srcW-1]*128; | |
2514 dst[i+2048] = src2[srcW-1]*128; | |
2469 | 2515 } |
2516 } | |
2517 else | |
2518 { | |
2519 #endif | |
2520 long xInc_shr16 = (long) (xInc >> 16); |
2521 int xInc_mask = xInc & 0xffff; |
2469 | 2522 asm volatile( |
2523 "xor %%"REG_a", %%"REG_a" \n\t" // i |
2524 "xor %%"REG_b", %%"REG_b" \n\t" // xx |
2469 | 2525 "xorl %%ecx, %%ecx \n\t" // 2*xalpha |
2526 ".balign 16 \n\t" |
2469 | 2527 "1: \n\t" |
2528 "mov %0, %%"REG_S" \n\t" |
2529 "movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx] |
2530 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1] |
2469 | 2531 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2532 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2533 "shll $16, %%edi \n\t" | |
2534 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2535 "mov %1, %%"REG_D" \n\t" |
2469 | 2536 "shrl $9, %%esi \n\t" |
15845 | 2537 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t" |
2469 | 2538 |
2539 "movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx] |
2540 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1] |
2469 | 2541 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] |
2542 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
2543 "shll $16, %%edi \n\t" | |
2544 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
2545 "mov %1, %%"REG_D" \n\t" |
2469 | 2546 "shrl $9, %%esi \n\t" |
2547 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t" |
2469 | 2548 |
2549 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
2550 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry |
2551 "add $1, %%"REG_a" \n\t" |
2552 "cmp %2, %%"REG_a" \n\t" |
2469 | 2553 " jb 1b \n\t" |
2554 | |
2555 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here, |
2556 which is needed to support GCC-4.0 */ |
2557 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4)) |
2558 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), |
2559 #else |
2560 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), |
2561 #endif |
2469 | 2562 "r" (src2) |
2563 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi" |
2469 | 2564 ); |
2565 #ifdef HAVE_MMX2 | |
11000 | 2566 } //if MMX2 can't be used |
2469 | 2567 #endif |
2568 #else | |
2671 | 2569 int i; |
2570 unsigned int xpos=0; | |
2571 for(i=0;i<dstWidth;i++) | |
2572 { | |
2573 register unsigned int xx=xpos>>16; | |
2574 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2575 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | |
2576 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | |
2566 | 2577 /* slower |
2578 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; | |
2579 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; | |
2580 */ | |
2671 | 2581 xpos+=xInc; |
2582 } | |
2469 | 2583 #endif |
3272 | 2584 } |
2585 } | |
2586 | |
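/* hcscale() above is the chroma counterpart of hyscale(): it converts both
   chroma source lines at once (U into the first half of the work buffer, V at
   a fixed +2048 sample offset, matching formatConvBuffer+2048) and scales them
   with the same fast-bilinear or hScale() paths; grayscale sources simply
   return without producing any chroma. */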
9499 | 2587 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, |
2588 int srcSliceH, uint8_t* dst[], int dstStride[]){ | |
3344 | 2589 |
2590 /* load a few things into local vars to make the code more readable? and faster */ |
2591 const int srcW= c->srcW; |
2592 const int dstW= c->dstW; |
2593 const int dstH= c->dstH; |
2594 const int chrDstW= c->chrDstW; |
2595 const int chrSrcW= c->chrSrcW; |
2596 const int lumXInc= c->lumXInc; |
2597 const int chrXInc= c->chrXInc; |
4295 | 2598 const int dstFormat= c->dstFormat; |
6503 | 2599 const int srcFormat= c->srcFormat; |
2600 const int flags= c->flags; |
2601 const int canMMX2BeUsed= c->canMMX2BeUsed; |
2602 int16_t *vLumFilterPos= c->vLumFilterPos; |
2603 int16_t *vChrFilterPos= c->vChrFilterPos; |
2604 int16_t *hLumFilterPos= c->hLumFilterPos; |
2605 int16_t *hChrFilterPos= c->hChrFilterPos; |
2606 int16_t *vLumFilter= c->vLumFilter; |
2607 int16_t *vChrFilter= c->vChrFilter; |
2608 int16_t *hLumFilter= c->hLumFilter; |
2609 int16_t *hChrFilter= c->hChrFilter; |
9413 | 2610 int32_t *lumMmxFilter= c->lumMmxFilter; |
2611 int32_t *chrMmxFilter= c->chrMmxFilter; | |
2612 const int vLumFilterSize= c->vLumFilterSize; |
2613 const int vChrFilterSize= c->vChrFilterSize; |
2614 const int hLumFilterSize= c->hLumFilterSize; |
2615 const int hChrFilterSize= c->hChrFilterSize; |
2616 int16_t **lumPixBuf= c->lumPixBuf; |
2617 int16_t **chrPixBuf= c->chrPixBuf; |
2618 const int vLumBufSize= c->vLumBufSize; |
2619 const int vChrBufSize= c->vChrBufSize; |
2620 uint8_t *funnyYCode= c->funnyYCode; |
2621 uint8_t *funnyUVCode= c->funnyUVCode; |
4467 | 2622 uint8_t *formatConvBuffer= c->formatConvBuffer; |
2623 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample; |
2624 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); |
2625 int lastDstY; |
3344 | 2626 |
2627 /* vars which will change and which we need to store back in the context */
2628 int dstY= c->dstY; |
2629 int lumBufIndex= c->lumBufIndex; |
2630 int chrBufIndex= c->chrBufIndex; |
2631 int lastInLumBuf= c->lastInLumBuf; |
2632 int lastInChrBuf= c->lastInChrBuf; |
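/* lumPixBuf/chrPixBuf act as ring buffers of vLumBufSize/vChrBufSize
   horizontally scaled lines; lumBufIndex/chrBufIndex are the current slots,
   lastInLumBuf/lastInChrBuf the last source lines already scaled, and dstY the
   next output line. They are loaded from and stored back into the context so
   scaling can resume when the next slice arrives. */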
6540 | 2633 |
2634 if(isPacked(c->srcFormat)){ | |
4467 | 2635 src[0]= |
2636 src[1]= | |
9499 | 2637 src[2]= src[0]; |
6540 | 2638 srcStride[0]= |
4467 | 2639 srcStride[1]= |
9499 | 2640 srcStride[2]= srcStride[0]; |
4467 | 2641 } |
6540 | 2642 srcStride[1]<<= c->vChrDrop; |
2643 srcStride[2]<<= c->vChrDrop; | |
4419 | 2644 |
6517 | 2645 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2], |
2646 // (int)dst[0], (int)dst[1], (int)dst[2]); | |
2647 | |
2648 #if 0 //self test FIXME move to a vfilter or something | |
2649 { | |
2650 static volatile int i=0; | |
2651 i++; | |
2652 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH) | |
2653 selfTest(src, srcStride, c->srcW, c->srcH); | |
2654 i--; | |
2655 } | |
2656 #endif | |
4554 | 2657 |
2658 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2], | |
2659 //dstStride[0],dstStride[1],dstStride[2]); | |
4419 | 2660 |
2661 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0) | |
2662 { | |
2663 static int firstTime=1; //FIXME move this into the context perhaps | |
2664 if(flags & SWS_PRINT_INFO && firstTime) | |
2665 { | |
9970 | 2666 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n" |
4419 | 2667 "SwScaler: ->cannot do aligned memory acesses anymore\n"); |
2668 firstTime=0; | |
2669 } | |
2670 } | |
3344 | 2671 |
4467 | 2672 /* Note: the user might start scaling the picture in the middle, so this will not get executed;
2673 this is not really intended but works currently, so people might do it */
2674 if(srcSliceY ==0){ |
2675 lumBufIndex=0; |
2676 chrBufIndex=0; |
4467 | 2677 dstY=0; |
2678 lastInLumBuf= -1; |
2679 lastInChrBuf= -1; |
3272 | 2680 } |
3344 | 2681 |
2682 lastDstY= dstY; |
2683 |
3344 | 2684 for(;dstY < dstH; dstY++){ |
2685 unsigned char *dest =dst[0]+dstStride[0]*dstY; |
6520 | 2686 const int chrDstY= dstY>>c->chrDstVSubSample; |
2687 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY; | |
2688 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY; | |
3344 | 2689 |
2690 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input | |
2691 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input | |
2692 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input | |
2693 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input | |
2694 | |
11122 | 2695 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n", |
2696 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample); | |
2697 //handle holes (FAST_BILINEAR & weird filters) |
2698 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; |
2699 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; |
2700 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize); |
3344 | 2701 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1) |
2702 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1) | |
2216 | 2703 |
3344 | 2704 // Do we have enough lines in this slice to output the dstY line |
2705 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample)) |
2469 | 2706 { |
3344 | 2707 //Do horizontal scaling |
2708 while(lastInLumBuf < lastLumSrcY) | |
2709 { | |
2710 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; |
3344 | 2711 lumBufIndex++; |
2712 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY); |
3344 | 2713 ASSERT(lumBufIndex < 2*vLumBufSize) |
2714 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
2715 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
2716 // printf("%d %d\n", lumBufIndex, vLumBufSize); | |
2717 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
2718 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, |
5452 | 2719 funnyYCode, c->srcFormat, formatConvBuffer, |
2720 c->lumMmx2Filter, c->lumMmx2FilterPos); | |
3344 | 2721 lastInLumBuf++; |
2722 } | |
2723 while(lastInChrBuf < lastChrSrcY) | |
2724 { | |
2725 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; |
2726 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; |
3344 | 2727 chrBufIndex++; |
2728 ASSERT(chrBufIndex < 2*vChrBufSize) | |
2729 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)) |
2730 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0) |
2731 //FIXME replace parameters through context struct (some at least) |
6503 | 2732 |
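                        // grayscale input or output needs no chroma, so horizontal chroma scaling is skipped entirely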
2733 if(!(isGray(srcFormat) || isGray(dstFormat))) | |
2734 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, |
2735 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, |
5452 | 2736 funnyUVCode, c->srcFormat, formatConvBuffer, |
2737 c->chrMmx2Filter, c->chrMmx2FilterPos); | |
3344 | 2738 lastInChrBuf++; |
2739 } | |
2740 //wrap buf index around to stay inside the ring buffer | |
2741 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
2742 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
2469 | 2743 } |
3344 | 2744 else // not enough lines left in this slice -> load the rest in the buffer |
2469 | 2745 { |
3344 | 2746 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n", |
2747 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY, | |
2748 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize, | |
2749 vChrBufSize, vLumBufSize);*/ |
2750 |
3344 | 2751 //Do horizontal scaling |
2752 while(lastInLumBuf+1 < srcSliceY + srcSliceH) | |
2469 | 2753 { |
2754 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; |
3344 | 2755 lumBufIndex++; |
2756 ASSERT(lumBufIndex < 2*vLumBufSize) | |
2757 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
2758 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
2759 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
2760 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, |
5452 | 2761 funnyYCode, c->srcFormat, formatConvBuffer, |
2762 c->lumMmx2Filter, c->lumMmx2FilterPos); | |
3344 | 2763 lastInLumBuf++; |
2469 | 2764 } |
2765 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH)) |
3344 | 2766 { |
2767 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; |
2768 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; |
3344 | 2769 chrBufIndex++; |
2770 ASSERT(chrBufIndex < 2*vChrBufSize) | |
2771 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH) |
2772 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0) |
6503 | 2773 |
2774 if(!(isGray(srcFormat) || isGray(dstFormat))) | |
2775 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, |
2776 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, |
5452 | 2777 funnyUVCode, c->srcFormat, formatConvBuffer, |
2778 c->chrMmx2Filter, c->chrMmx2FilterPos); | |
3344 | 2779 lastInChrBuf++; |
2780 } | |
2781 //wrap buf index around to stay inside the ring buffer | |
2782 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
2783 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
11000 | 2784 break; //we can't output a dstY line so let's try with the next slice |
2469 | 2785 } |
2786 |
2748 | 2787 #ifdef HAVE_MMX |
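                // select the dither rows for this output line; indexing by dstY&1 alternates the pattern
                // between even and odd lines (apparently used by the 15/16 bpp RGB output code)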
3344 | 2788 b5Dither= dither8[dstY&1]; |
2789 g6Dither= dither4[dstY&1]; | |
2790 g5Dither= dither8[dstY&1]; | |
2791 r5Dither= dither8[(dstY+1)&1]; | |
2748 | 2792 #endif |
2793 if(dstY < dstH-2) |
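                // all but the last two output lines use the template (possibly MMX) output functions;
                // the remaining lines fall through to the plain C variants in the else branch below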
3352 | 2794 { |
9414 | 2795 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; |
2796 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
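                        // lumSrcPtr/chrSrcPtr point at the vLumFilterSize/vChrFilterSize consecutive ring-buffer
                        // entries needed for this line; the pointer arrays appear to be allocated twice as large
                        // (2*vLumBufSize / 2*vChrBufSize entries), so no wrap-around is needed here (cf. the ASSERTs below)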
2797 #ifdef HAVE_MMX | |
2798 int i; | |
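                        // build the per-tap filter descriptors for the MMX vertical-scaling code: the source-line
                        // pointer at offset 0 and the 16 bit coefficient replicated into both halves of offsets 2
                        // and 3 (that is what the *0x10001 does)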
2799 for(i=0; i<vLumFilterSize; i++) | |
2800 { | |
2801 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i]; | |
2802 lumMmxFilter[4*i+2]= | |
2803 lumMmxFilter[4*i+3]= | |
2804 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; | |
2805 } | |
2806 for(i=0; i<vChrFilterSize; i++) | |
2807 { | |
2808 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i]; | |
2809 chrMmxFilter[4*i+2]= | |
2810 chrMmxFilter[4*i+3]= | |
2811 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; | |
2812 } | |
2813 #endif | |
14715 | 2814 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){ |
2815 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
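                                // with vertically subsampled chroma only every (1<<chrDstVSubSample)-th output line
                                // carries chroma samples, so uDest is disabled on the lines in between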
2816 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi | |
2817 RENAME(yuv2nv12X)(c, | |
2818 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
2819 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2820 dest, uDest, dstW, chrDstW, dstFormat); | |
2821 } | |
2822 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like | |
3344 | 2823 { |
7351 | 2824 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; |
2825 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
3344 | 2826 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12 |
2827 { | |
2828 int16_t *lumBuf = lumPixBuf[0]; | |
2829 int16_t *chrBuf= chrPixBuf[0]; | |
2830 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW); |
3344 | 2831 } |
2832 else //General YV12 | |
2833 { | |
9413 | 2834 RENAME(yuv2yuvX)(c, |
2835 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, |
2836 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, |
9414 | 2837 dest, uDest, vDest, dstW, chrDstW); |
3344 | 2838 } |
2839 } | |
2840 else | |
2841 { | |
2842 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
2843 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
2844 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB | |
2845 { | |
2846 int chrAlpha= vChrFilter[2*dstY+1]; | |
7723 | 2847 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), |
6578 | 2848 dest, dstW, chrAlpha, dstFormat, flags, dstY); |
3344 | 2849 } |
2850 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB | |
2851 { | |
2852 int lumAlpha= vLumFilter[2*dstY+1]; | |
2853 int chrAlpha= vChrFilter[2*dstY+1]; | |
7723 | 2854 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), |
6578 | 2855 dest, dstW, lumAlpha, chrAlpha, dstY); |
3344 | 2856 } |
2857 else //General RGB | |
2858 { | |
7723 | 2859 RENAME(yuv2packedX)(c, |
3344 | 2860 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, |
2861 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
9413 | 2862 dest, dstW, dstY); |
3344 | 2863 } |
2864 } | |
3352 | 2865 } |
11000 | 2866 else // hmm looks like we can't use MMX here without overwriting this array's tail |
3352 | 2867 { |
2868 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
2869 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
14715 | 2870 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){ |
2871 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | |
2872 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi | |
2873 yuv2nv12XinC( | |
2874 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
2875 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2876 dest, uDest, dstW, chrDstW, dstFormat); | |
2877 } | |
2878 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 | |
3352 | 2879 { |
7351 | 2880 const int chrSkipMask= (1<<c->chrDstVSubSample)-1; |
2881 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
6540 | 2882 yuv2yuvXinC( |
2883 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, |
2884 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, |
6540 | 2885 dest, uDest, vDest, dstW, chrDstW); |
3352 | 2886 } |
2887 else | |
2888 { | |
2889 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
2890 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
7723 | 2891 yuv2packedXinC(c, |
3352 | 2892 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, |
2893 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
6578 | 2894 dest, dstW, dstY); |
3352 | 2895 } |
2896 } | |
3344 | 2897 } |
2898 |
2899 #ifdef HAVE_MMX |
2900 __asm __volatile(SFENCE:::"memory"); |
2566 | 2901 __asm __volatile(EMMS:::"memory"); |
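                // SFENCE makes the non-temporal MOVNTQ stores globally visible; EMMS clears the MMX
                // state so that subsequent FPU code works correctly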
2902 #endif |
2903 /* store changed local vars back in the context */ |
2904 c->dstY= dstY; |
2905 c->lumBufIndex= lumBufIndex; |
2906 c->chrBufIndex= chrBufIndex; |
2907 c->lastInLumBuf= lastInLumBuf; |
2908 c->lastInChrBuf= lastInChrBuf; |
2909 |
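        // return the number of output lines produced by this call
        // (lastDstY is presumably the dstY value saved at function entry)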
2910 return dstY - lastDstY; |
3641 | 2911 } |