Mercurial > mplayer.hg
annotate postproc/swscale_template.c @ 3678:265680bbdcfd
mix3to2 in SSE
author | michael |
---|---|
date | Sun, 23 Dec 2001 17:32:34 +0000 |
parents | 33c560ffd3dc |
children | 666a946a71b9 |
rev | line source |
---|---|
2216 | 1 |
2 // Software scaling and colorspace conversion routines for MPlayer | |
3 | |
2269 | 4 // Orginal C implementation by A'rpi/ESP-team <arpi@thot.banki.hu> |
2264
7851375ea156
increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents:
2237
diff
changeset
|
5 // current version mostly by Michael Niedermayer (michaelni@gmx.at) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
6 // the parts written by michael are under GNU GPL |
2264
7851375ea156
increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents:
2237
diff
changeset
|
7 |
2540 | 8 #undef MOVNTQ |
2680 | 9 #undef PAVGB |
3136 | 10 #undef PREFETCH |
11 #undef PREFETCHW | |
12 #undef EMMS | |
13 #undef SFENCE | |
14 | |
15 #ifdef HAVE_3DNOW | |
16 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */ | |
17 #define EMMS "femms" | |
18 #else | |
19 #define EMMS "emms" | |
20 #endif | |
21 | |
22 #ifdef HAVE_3DNOW | |
23 #define PREFETCH "prefetch" | |
24 #define PREFETCHW "prefetchw" | |
25 #elif defined ( HAVE_MMX2 ) | |
26 #define PREFETCH "prefetchnta" | |
27 #define PREFETCHW "prefetcht0" | |
28 #else | |
29 #define PREFETCH "/nop" | |
30 #define PREFETCHW "/nop" | |
31 #endif | |
32 | |
33 #ifdef HAVE_MMX2 | |
34 #define SFENCE "sfence" | |
35 #else | |
36 #define SFENCE "/nop" | |
37 #endif | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
38 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
39 #ifdef HAVE_MMX2 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
40 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
41 #elif defined (HAVE_3DNOW) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
42 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
43 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
44 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
45 #ifdef HAVE_MMX2 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
46 #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
47 #else |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
48 #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
49 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
50 |
3344 | 51 |
52 #define YSCALEYUV2YV12X(x) \ | |
53 "xorl %%eax, %%eax \n\t"\ | |
54 "pxor %%mm3, %%mm3 \n\t"\ | |
55 "pxor %%mm4, %%mm4 \n\t"\ | |
56 "movl %0, %%edx \n\t"\ | |
57 ".balign 16 \n\t" /* FIXME Unroll? */\ | |
58 "1: \n\t"\ | |
59 "movl (%1, %%edx, 4), %%esi \n\t"\ | |
60 "movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\ | |
61 "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\ | |
62 "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\ | |
63 "pmulhw %%mm0, %%mm2 \n\t"\ | |
64 "pmulhw %%mm0, %%mm5 \n\t"\ | |
65 "paddw %%mm2, %%mm3 \n\t"\ | |
66 "paddw %%mm5, %%mm4 \n\t"\ | |
67 "addl $1, %%edx \n\t"\ | |
68 " jnz 1b \n\t"\ | |
69 "psraw $3, %%mm3 \n\t"\ | |
70 "psraw $3, %%mm4 \n\t"\ | |
71 "packuswb %%mm4, %%mm3 \n\t"\ | |
72 MOVNTQ(%%mm3, (%3, %%eax))\ | |
73 "addl $8, %%eax \n\t"\ | |
74 "cmpl %4, %%eax \n\t"\ | |
75 "pxor %%mm3, %%mm3 \n\t"\ | |
76 "pxor %%mm4, %%mm4 \n\t"\ | |
77 "movl %0, %%edx \n\t"\ | |
78 "jb 1b \n\t" | |
79 | |
80 #define YSCALEYUV2YV121 \ | |
81 "movl %2, %%eax \n\t"\ | |
82 ".balign 16 \n\t" /* FIXME Unroll? */\ | |
83 "1: \n\t"\ | |
84 "movq (%0, %%eax, 2), %%mm0 \n\t"\ | |
85 "movq 8(%0, %%eax, 2), %%mm1 \n\t"\ | |
86 "psraw $7, %%mm0 \n\t"\ | |
87 "psraw $7, %%mm1 \n\t"\ | |
88 "packuswb %%mm1, %%mm0 \n\t"\ | |
89 MOVNTQ(%%mm0, (%1, %%eax))\ | |
90 "addl $8, %%eax \n\t"\ | |
91 "jnc 1b \n\t" | |
92 | |
93 /* | |
94 :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
95 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
96 "r" (dest), "m" (dstW), | |
97 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
98 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
99 */ | |
100 #define YSCALEYUV2RGBX \ | |
101 "xorl %%eax, %%eax \n\t"\ | |
102 ".balign 16 \n\t"\ | |
103 "1: \n\t"\ | |
104 "movl %1, %%edx \n\t" /* -chrFilterSize */\ | |
105 "movl %3, %%ebx \n\t" /* chrMmxFilter+lumFilterSize */\ | |
106 "movl %7, %%ecx \n\t" /* chrSrc+lumFilterSize */\ | |
107 "pxor %%mm3, %%mm3 \n\t"\ | |
108 "pxor %%mm4, %%mm4 \n\t"\ | |
109 "2: \n\t"\ | |
110 "movl (%%ecx, %%edx, 4), %%esi \n\t"\ | |
111 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\ | |
112 "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\ | |
113 "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\ | |
114 "pmulhw %%mm0, %%mm2 \n\t"\ | |
115 "pmulhw %%mm0, %%mm5 \n\t"\ | |
116 "paddw %%mm2, %%mm3 \n\t"\ | |
117 "paddw %%mm5, %%mm4 \n\t"\ | |
118 "addl $1, %%edx \n\t"\ | |
119 " jnz 2b \n\t"\ | |
120 \ | |
121 "movl %0, %%edx \n\t" /* -lumFilterSize */\ | |
122 "movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize */\ | |
123 "movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\ | |
124 "pxor %%mm1, %%mm1 \n\t"\ | |
125 "pxor %%mm7, %%mm7 \n\t"\ | |
126 "2: \n\t"\ | |
127 "movl (%%ecx, %%edx, 4), %%esi \n\t"\ | |
128 "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\ | |
129 "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\ | |
130 "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\ | |
131 "pmulhw %%mm0, %%mm2 \n\t"\ | |
132 "pmulhw %%mm0, %%mm5 \n\t"\ | |
133 "paddw %%mm2, %%mm1 \n\t"\ | |
134 "paddw %%mm5, %%mm7 \n\t"\ | |
135 "addl $1, %%edx \n\t"\ | |
136 " jnz 2b \n\t"\ | |
137 \ | |
138 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\ | |
139 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\ | |
140 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |
141 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | |
142 "pmulhw ugCoeff, %%mm3 \n\t"\ | |
143 "pmulhw vgCoeff, %%mm4 \n\t"\ | |
144 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | |
145 "pmulhw ubCoeff, %%mm2 \n\t"\ | |
146 "pmulhw vrCoeff, %%mm5 \n\t"\ | |
147 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\ | |
148 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\ | |
149 "pmulhw yCoeff, %%mm1 \n\t"\ | |
150 "pmulhw yCoeff, %%mm7 \n\t"\ | |
151 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | |
152 "paddw %%mm3, %%mm4 \n\t"\ | |
153 "movq %%mm2, %%mm0 \n\t"\ | |
154 "movq %%mm5, %%mm6 \n\t"\ | |
155 "movq %%mm4, %%mm3 \n\t"\ | |
156 "punpcklwd %%mm2, %%mm2 \n\t"\ | |
157 "punpcklwd %%mm5, %%mm5 \n\t"\ | |
158 "punpcklwd %%mm4, %%mm4 \n\t"\ | |
159 "paddw %%mm1, %%mm2 \n\t"\ | |
160 "paddw %%mm1, %%mm5 \n\t"\ | |
161 "paddw %%mm1, %%mm4 \n\t"\ | |
162 "punpckhwd %%mm0, %%mm0 \n\t"\ | |
163 "punpckhwd %%mm6, %%mm6 \n\t"\ | |
164 "punpckhwd %%mm3, %%mm3 \n\t"\ | |
165 "paddw %%mm7, %%mm0 \n\t"\ | |
166 "paddw %%mm7, %%mm6 \n\t"\ | |
167 "paddw %%mm7, %%mm3 \n\t"\ | |
168 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | |
169 "packuswb %%mm0, %%mm2 \n\t"\ | |
170 "packuswb %%mm6, %%mm5 \n\t"\ | |
171 "packuswb %%mm3, %%mm4 \n\t"\ | |
172 "pxor %%mm7, %%mm7 \n\t" | |
173 | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
174 #define FULL_YSCALEYUV2RGB \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
175 "pxor %%mm7, %%mm7 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
176 "movd %6, %%mm6 \n\t" /*yalpha1*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
177 "punpcklwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
178 "punpcklwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
179 "movd %7, %%mm5 \n\t" /*uvalpha1*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
180 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
181 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
182 "xorl %%eax, %%eax \n\t"\ |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
183 ".balign 16 \n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
184 "1: \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
185 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
186 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
187 "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
188 "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
189 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
190 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
191 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
192 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
193 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
194 "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
195 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
196 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
197 "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
198 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
199 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
200 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
201 "psubw w400, %%mm3 \n\t" /* 8(U-128)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
202 "pmulhw yCoeff, %%mm1 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
203 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
204 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
205 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
206 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
207 "pmulhw ubCoeff, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
208 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
209 "pmulhw ugCoeff, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
210 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
211 "psubw w400, %%mm0 \n\t" /* (V-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
212 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
213 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
214 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
215 "pmulhw vrCoeff, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
216 "pmulhw vgCoeff, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
217 "paddw %%mm1, %%mm3 \n\t" /* B*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
218 "paddw %%mm1, %%mm0 \n\t" /* R*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
219 "packuswb %%mm3, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
220 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
221 "packuswb %%mm0, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
222 "paddw %%mm4, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
223 "paddw %%mm2, %%mm1 \n\t" /* G*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
224 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
225 "packuswb %%mm1, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
226 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
227 #define YSCALEYUV2RGB \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
228 "movd %6, %%mm6 \n\t" /*yalpha1*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
229 "punpcklwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
230 "punpcklwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
231 "movq %%mm6, asm_yalpha1 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
232 "movd %7, %%mm5 \n\t" /*uvalpha1*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
233 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
234 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
235 "movq %%mm5, asm_uvalpha1 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
236 "xorl %%eax, %%eax \n\t"\ |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
237 ".balign 16 \n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
238 "1: \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
239 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
240 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
241 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
242 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
243 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
244 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
245 "movq asm_uvalpha1, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
246 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
247 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
248 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
249 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
250 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
251 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
252 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
253 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
254 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
255 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
256 "pmulhw ugCoeff, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
257 "pmulhw vgCoeff, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
258 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
259 "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
260 "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
261 "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
262 "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
263 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
264 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
265 "pmulhw asm_yalpha1, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
266 "pmulhw asm_yalpha1, %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
267 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
268 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
269 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
270 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
271 "pmulhw ubCoeff, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
272 "pmulhw vrCoeff, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
273 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
274 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
275 "pmulhw yCoeff, %%mm1 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
276 "pmulhw yCoeff, %%mm7 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
277 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
278 "paddw %%mm3, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
279 "movq %%mm2, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
280 "movq %%mm5, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
281 "movq %%mm4, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
282 "punpcklwd %%mm2, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
283 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
284 "punpcklwd %%mm4, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
285 "paddw %%mm1, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
286 "paddw %%mm1, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
287 "paddw %%mm1, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
288 "punpckhwd %%mm0, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
289 "punpckhwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
290 "punpckhwd %%mm3, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
291 "paddw %%mm7, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
292 "paddw %%mm7, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
293 "paddw %%mm7, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
294 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
295 "packuswb %%mm0, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
296 "packuswb %%mm6, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
297 "packuswb %%mm3, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
298 "pxor %%mm7, %%mm7 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
299 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
300 #define YSCALEYUV2RGB1 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
301 "xorl %%eax, %%eax \n\t"\ |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
302 ".balign 16 \n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
303 "1: \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
304 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
305 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
306 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
307 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
308 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
309 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
310 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
311 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
312 "pmulhw ugCoeff, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
313 "pmulhw vgCoeff, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
314 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
315 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
316 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
317 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
318 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
319 "pmulhw ubCoeff, %%mm2 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
320 "pmulhw vrCoeff, %%mm5 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
321 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
322 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
323 "pmulhw yCoeff, %%mm1 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
324 "pmulhw yCoeff, %%mm7 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
325 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
326 "paddw %%mm3, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
327 "movq %%mm2, %%mm0 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
328 "movq %%mm5, %%mm6 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
329 "movq %%mm4, %%mm3 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
330 "punpcklwd %%mm2, %%mm2 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
331 "punpcklwd %%mm5, %%mm5 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
332 "punpcklwd %%mm4, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
333 "paddw %%mm1, %%mm2 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
334 "paddw %%mm1, %%mm5 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
335 "paddw %%mm1, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
336 "punpckhwd %%mm0, %%mm0 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
337 "punpckhwd %%mm6, %%mm6 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
338 "punpckhwd %%mm3, %%mm3 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
339 "paddw %%mm7, %%mm0 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
340 "paddw %%mm7, %%mm6 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
341 "paddw %%mm7, %%mm3 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
342 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
343 "packuswb %%mm0, %%mm2 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
344 "packuswb %%mm6, %%mm5 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
345 "packuswb %%mm3, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
346 "pxor %%mm7, %%mm7 \n\t" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
347 |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
348 // do vertical chrominance interpolation |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
349 #define YSCALEYUV2RGB1b \ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
350 "xorl %%eax, %%eax \n\t"\ |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
351 ".balign 16 \n\t"\ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
352 "1: \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
353 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
354 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
355 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
356 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ |
2576 | 357 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ |
358 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ | |
3344 | 359 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ |
360 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
361 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
362 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
363 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
364 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
365 "pmulhw ugCoeff, %%mm3 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
366 "pmulhw vgCoeff, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
367 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
368 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
369 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
370 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
371 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
372 "pmulhw ubCoeff, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
373 "pmulhw vrCoeff, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
374 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
375 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
376 "pmulhw yCoeff, %%mm1 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
377 "pmulhw yCoeff, %%mm7 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
378 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
379 "paddw %%mm3, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
380 "movq %%mm2, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
381 "movq %%mm5, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
382 "movq %%mm4, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
383 "punpcklwd %%mm2, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
384 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
385 "punpcklwd %%mm4, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
386 "paddw %%mm1, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
387 "paddw %%mm1, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
388 "paddw %%mm1, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
389 "punpckhwd %%mm0, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
390 "punpckhwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
391 "punpckhwd %%mm3, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
392 "paddw %%mm7, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
393 "paddw %%mm7, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
394 "paddw %%mm7, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
395 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
396 "packuswb %%mm0, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
397 "packuswb %%mm6, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
398 "packuswb %%mm3, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
399 "pxor %%mm7, %%mm7 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
400 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
401 #define WRITEBGR32 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
402 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
403 "movq %%mm2, %%mm1 \n\t" /* B */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
404 "movq %%mm5, %%mm6 \n\t" /* R */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
405 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
406 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
407 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
408 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
409 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
410 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
411 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
412 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
413 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
414 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
415 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
416 MOVNTQ(%%mm0, (%4, %%eax, 4))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
417 MOVNTQ(%%mm2, 8(%4, %%eax, 4))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
418 MOVNTQ(%%mm1, 16(%4, %%eax, 4))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
419 MOVNTQ(%%mm3, 24(%4, %%eax, 4))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
420 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
421 "addl $8, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
422 "cmpl %5, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
423 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
424 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
425 #define WRITEBGR16 \ |
2669 | 426 "pand bF8, %%mm2 \n\t" /* B */\ |
427 "pand bFC, %%mm4 \n\t" /* G */\ | |
428 "pand bF8, %%mm5 \n\t" /* R */\ | |
429 "psrlq $3, %%mm2 \n\t"\ | |
430 \ | |
431 "movq %%mm2, %%mm1 \n\t"\ | |
432 "movq %%mm4, %%mm3 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
433 \ |
2669 | 434 "punpcklbw %%mm7, %%mm3 \n\t"\ |
435 "punpcklbw %%mm5, %%mm2 \n\t"\ | |
436 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
437 "punpckhbw %%mm5, %%mm1 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
438 \ |
2669 | 439 "psllq $3, %%mm3 \n\t"\ |
440 "psllq $3, %%mm4 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
441 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
442 "por %%mm3, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
443 "por %%mm4, %%mm1 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
444 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
445 MOVNTQ(%%mm2, (%4, %%eax, 2))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
446 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
447 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
448 "addl $8, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
449 "cmpl %5, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
450 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
451 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
452 #define WRITEBGR15 \ |
2669 | 453 "pand bF8, %%mm2 \n\t" /* B */\ |
454 "pand bF8, %%mm4 \n\t" /* G */\ | |
455 "pand bF8, %%mm5 \n\t" /* R */\ | |
456 "psrlq $3, %%mm2 \n\t"\ | |
457 "psrlq $1, %%mm5 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
458 \ |
2669 | 459 "movq %%mm2, %%mm1 \n\t"\ |
460 "movq %%mm4, %%mm3 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
461 \ |
2669 | 462 "punpcklbw %%mm7, %%mm3 \n\t"\ |
463 "punpcklbw %%mm5, %%mm2 \n\t"\ | |
464 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
465 "punpckhbw %%mm5, %%mm1 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
466 \ |
2669 | 467 "psllq $2, %%mm3 \n\t"\ |
468 "psllq $2, %%mm4 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
469 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
470 "por %%mm3, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
471 "por %%mm4, %%mm1 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
472 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
473 MOVNTQ(%%mm2, (%4, %%eax, 2))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
474 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
475 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
476 "addl $8, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
477 "cmpl %5, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
478 " jb 1b \n\t" |
2669 | 479 |
2730 | 480 #define WRITEBGR24OLD \ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
481 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
482 "movq %%mm2, %%mm1 \n\t" /* B */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
483 "movq %%mm5, %%mm6 \n\t" /* R */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
484 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
485 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
486 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
487 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
488 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
489 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ |
2326 | 490 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ |
491 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
492 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
493 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
494 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
495 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
496 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
497 "pand bm00000111, %%mm4 \n\t" /* 00000RGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
498 "pand bm11111000, %%mm0 \n\t" /* 00RGB000 0.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
499 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
500 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
501 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
502 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
503 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
504 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
505 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
506 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
507 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
508 "pand bm00001111, %%mm2 \n\t" /* 0000RGBR 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
509 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
510 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
511 "pand bm00000111, %%mm4 \n\t" /* 00000RGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
512 "pand bm11111000, %%mm1 \n\t" /* 00RGB000 2.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
513 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
514 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
515 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
516 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
517 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
518 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
519 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
520 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
521 "pand bm00000111, %%mm5 \n\t" /* 00000RGB 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
522 "pand bm11111000, %%mm3 \n\t" /* 00RGB000 3.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
523 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
524 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
525 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
526 \ |
2728 | 527 MOVNTQ(%%mm0, (%%ebx))\ |
528 MOVNTQ(%%mm2, 8(%%ebx))\ | |
529 MOVNTQ(%%mm3, 16(%%ebx))\ | |
530 "addl $24, %%ebx \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
531 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
532 "addl $8, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
533 "cmpl %5, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
534 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
535 |
2730 | 536 #define WRITEBGR24MMX \ |
537 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ | |
538 "movq %%mm2, %%mm1 \n\t" /* B */\ | |
539 "movq %%mm5, %%mm6 \n\t" /* R */\ | |
540 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | |
541 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | |
542 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | |
543 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | |
544 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | |
545 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | |
546 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | |
547 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
548 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
549 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
550 \ | |
551 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ | |
552 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ | |
553 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ | |
554 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ | |
555 \ | |
556 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ | |
557 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ | |
558 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ | |
559 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ | |
560 \ | |
561 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ | |
562 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ | |
563 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ | |
564 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ | |
565 \ | |
566 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ | |
567 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ | |
568 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ | |
569 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ | |
570 MOVNTQ(%%mm0, (%%ebx))\ | |
571 \ | |
572 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ | |
573 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ | |
574 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ | |
575 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ | |
576 MOVNTQ(%%mm6, 8(%%ebx))\ | |
577 \ | |
578 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ | |
579 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ | |
580 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ | |
581 MOVNTQ(%%mm5, 16(%%ebx))\ | |
582 \ | |
583 "addl $24, %%ebx \n\t"\ | |
584 \ | |
585 "addl $8, %%eax \n\t"\ | |
586 "cmpl %5, %%eax \n\t"\ | |
587 " jb 1b \n\t" | |
588 | |
589 #define WRITEBGR24MMX2 \ | |
590 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ | |
591 "movq M24A, %%mm0 \n\t"\ | |
592 "movq M24C, %%mm7 \n\t"\ | |
593 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ | |
594 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ | |
595 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ | |
596 \ | |
597 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ | |
598 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ | |
599 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ | |
600 \ | |
601 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ | |
602 "por %%mm1, %%mm6 \n\t"\ | |
603 "por %%mm3, %%mm6 \n\t"\ | |
604 MOVNTQ(%%mm6, (%%ebx))\ | |
605 \ | |
606 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ | |
607 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ | |
608 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ | |
609 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ | |
610 \ | |
611 "pand M24B, %%mm1 \n\t" /* B5 B4 B3 */\ | |
612 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ | |
613 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ | |
614 \ | |
615 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ | |
616 "por %%mm3, %%mm6 \n\t"\ | |
617 MOVNTQ(%%mm6, 8(%%ebx))\ | |
618 \ | |
619 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\ | |
620 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ | |
621 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\ | |
622 \ | |
623 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ | |
624 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ | |
625 "pand M24B, %%mm6 \n\t" /* R7 R6 R5 */\ | |
626 \ | |
627 "por %%mm1, %%mm3 \n\t"\ | |
628 "por %%mm3, %%mm6 \n\t"\ | |
629 MOVNTQ(%%mm6, 16(%%ebx))\ | |
630 \ | |
631 "addl $24, %%ebx \n\t"\ | |
632 \ | |
633 "addl $8, %%eax \n\t"\ | |
634 "cmpl %5, %%eax \n\t"\ | |
635 " jb 1b \n\t" | |
636 | |
637 #ifdef HAVE_MMX2 | |
3126 | 638 #undef WRITEBGR24 |
2730 | 639 #define WRITEBGR24 WRITEBGR24MMX2 |
640 #else | |
3126 | 641 #undef WRITEBGR24 |
2730 | 642 #define WRITEBGR24 WRITEBGR24MMX |
643 #endif | |
644 | |
3344 | 645 static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, |
646 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
647 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, | |
648 int16_t * lumMmxFilter, int16_t * chrMmxFilter) | |
2519 | 649 { |
3344 | 650 #ifdef HAVE_MMX |
651 if(uDest != NULL) | |
652 { | |
653 asm volatile( | |
654 YSCALEYUV2YV12X(0) | |
655 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize), | |
656 "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1) | |
657 : "%eax", "%edx", "%esi" | |
658 ); | |
2519 | 659 |
3344 | 660 asm volatile( |
661 YSCALEYUV2YV12X(4096) | |
662 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize), | |
663 "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1) | |
664 : "%eax", "%edx", "%esi" | |
665 ); | |
666 } | |
2521 | 667 |
3344 | 668 asm volatile( |
669 YSCALEYUV2YV12X(0) | |
670 :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize), | |
671 "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW) | |
672 : "%eax", "%edx", "%esi" | |
673 ); | |
674 #else | |
3352 | 675 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, |
676 chrFilter, chrSrc, chrFilterSize, | |
677 dest, uDest, vDest, dstW); | |
3344 | 678 #endif |
679 } | |
680 | |
681 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc, | |
682 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW) | |
683 { | |
684 #ifdef HAVE_MMX | |
685 if(uDest != NULL) | |
686 { | |
687 asm volatile( | |
688 YSCALEYUV2YV121 | |
689 :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)), | |
690 "g" (-(dstW>>1)) | |
691 : "%eax" | |
692 ); | |
693 | |
694 asm volatile( | |
695 YSCALEYUV2YV121 | |
696 :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)), | |
697 "g" (-(dstW>>1)) | |
698 : "%eax" | |
699 ); | |
2519 | 700 } |
3344 | 701 |
702 asm volatile( | |
703 YSCALEYUV2YV121 | |
704 :: "r" (lumSrc + dstW), "r" (dest + dstW), | |
705 "g" (-dstW) | |
706 : "%eax" | |
707 ); | |
708 #else | |
709 //FIXME Optimize (just quickly writen not opti..) | |
710 //FIXME replace MINMAX with LUTs | |
711 int i; | |
712 for(i=0; i<dstW; i++) | |
713 { | |
714 int val= lumSrc[i]>>7; | |
715 | |
716 dest[i]= MIN(MAX(val>>19, 0), 255); | |
717 } | |
718 | |
719 if(uDest != NULL) | |
720 for(i=0; i<(dstW>>1); i++) | |
721 { | |
722 int u=chrSrc[i]>>7; | |
723 int v=chrSrc[i + 2048]>>7; | |
724 | |
725 uDest[i]= MIN(MAX(u>>19, 0), 255); | |
726 vDest[i]= MIN(MAX(v>>19, 0), 255); | |
727 } | |
728 #endif | |
2519 | 729 } |
730 | |
3344 | 731 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
732 /** |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
733 * vertical scale YV12 to RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
734 */ |
3344 | 735 static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, |
736 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
737 uint8_t *dest, int dstW, int dstbpp, int16_t * lumMmxFilter, int16_t * chrMmxFilter) | |
738 { | |
739 if(fullUVIpol) | |
740 { | |
741 //FIXME | |
742 }//FULL_UV_IPOL | |
743 else | |
744 { | |
745 #ifdef HAVE_MMX | |
746 if(dstbpp == 32) //FIXME untested | |
747 { | |
748 asm volatile( | |
749 YSCALEYUV2RGBX | |
750 WRITEBGR32 | |
751 | |
752 :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
753 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
754 "r" (dest), "m" (dstW), | |
755 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
756 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
757 ); | |
758 } | |
759 else if(dstbpp==24) //FIXME untested | |
760 { | |
761 asm volatile( | |
762 YSCALEYUV2RGBX | |
763 "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize | |
764 "addl %4, %%ebx \n\t" | |
765 WRITEBGR24 | |
766 | |
767 :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
768 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
769 "r" (dest), "m" (dstW), | |
770 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
771 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
772 ); | |
773 } | |
774 else if(dstbpp==15) | |
775 { | |
776 asm volatile( | |
777 YSCALEYUV2RGBX | |
778 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
779 #ifdef DITHER1XBPP | |
780 "paddusb b5Dither, %%mm2 \n\t" | |
781 "paddusb g5Dither, %%mm4 \n\t" | |
782 "paddusb r5Dither, %%mm5 \n\t" | |
783 #endif | |
784 | |
785 WRITEBGR15 | |
786 | |
787 :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
788 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
789 "r" (dest), "m" (dstW), | |
790 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
791 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
792 ); | |
793 } | |
794 else if(dstbpp==16) | |
795 { | |
796 asm volatile( | |
797 YSCALEYUV2RGBX | |
798 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |
799 #ifdef DITHER1XBPP | |
800 "paddusb b5Dither, %%mm2 \n\t" | |
801 "paddusb g6Dither, %%mm4 \n\t" | |
802 "paddusb r5Dither, %%mm5 \n\t" | |
803 #endif | |
804 | |
805 WRITEBGR16 | |
806 | |
807 :: "m" (-lumFilterSize), "m" (-chrFilterSize), | |
808 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | |
809 "r" (dest), "m" (dstW), | |
810 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | |
811 : "%eax", "%ebx", "%ecx", "%edx", "%esi" | |
812 ); | |
813 } | |
814 #else | |
3352 | 815 yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize, |
816 chrFilter, chrSrc, chrFilterSize, | |
817 dest, dstW, dstbpp); | |
3344 | 818 |
819 #endif | |
820 } //!FULL_UV_IPOL | |
821 } | |
822 | |
823 | |
824 /** | |
825 * vertical bilinear scale YV12 to RGB | |
826 */ | |
827 static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, | |
3209 | 828 uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstbpp) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
829 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
830 int yalpha1=yalpha^4095; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
831 int uvalpha1=uvalpha^4095; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
832 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
833 if(fullUVIpol) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
834 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
835 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
836 #ifdef HAVE_MMX |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
837 if(dstbpp == 32) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
838 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
839 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
840 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
841 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
842 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
843 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
844 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
845 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
846 "movq %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
847 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
848 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
849 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
850 MOVNTQ(%%mm3, (%4, %%eax, 4)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
851 MOVNTQ(%%mm1, 8(%4, %%eax, 4)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
852 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
853 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
854 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
855 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
856 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
857 |
3209 | 858 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
859 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
860 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
861 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
862 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
863 else if(dstbpp==24) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
864 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
865 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
866 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
867 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
868 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
869 // lsb ... msb |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
870 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
871 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
872 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
873 "movq %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
874 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
875 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
876 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
877 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
878 "psrlq $8, %%mm3 \n\t" // GR0BGR00 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
879 "pand bm00000111, %%mm2 \n\t" // BGR00000 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
880 "pand bm11111000, %%mm3 \n\t" // 000BGR00 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
881 "por %%mm2, %%mm3 \n\t" // BGRBGR00 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
882 "movq %%mm1, %%mm2 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
883 "psllq $48, %%mm1 \n\t" // 000000BG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
884 "por %%mm1, %%mm3 \n\t" // BGRBGRBG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
885 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
886 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
887 "psrld $16, %%mm2 \n\t" // R000R000 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
888 "psrlq $24, %%mm1 \n\t" // 0BGR0000 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
889 "por %%mm2, %%mm1 \n\t" // RBGRR000 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
890 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
891 "movl %4, %%ebx \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
892 "addl %%eax, %%ebx \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
893 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
894 #ifdef HAVE_MMX2 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
895 //FIXME Alignment |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
896 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
897 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
898 #else |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
899 "movd %%mm3, (%%ebx, %%eax, 2) \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
900 "psrlq $32, %%mm3 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
901 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
902 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
903 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
904 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
905 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
906 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
907 |
3209 | 908 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
909 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
910 : "%eax", "%ebx" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
911 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
912 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
913 else if(dstbpp==15) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
914 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
915 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
916 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
917 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
918 #ifdef DITHER1XBPP |
2748 | 919 "paddusb g5Dither, %%mm1 \n\t" |
920 "paddusb r5Dither, %%mm0 \n\t" | |
921 "paddusb b5Dither, %%mm3 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
922 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
923 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
924 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
925 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
926 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
927 "psrlw $3, %%mm3 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
928 "psllw $2, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
929 "psllw $7, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
930 "pand g15Mask, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
931 "pand r15Mask, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
932 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
933 "por %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
934 "por %%mm1, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
935 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
936 MOVNTQ(%%mm0, (%4, %%eax, 2)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
937 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
938 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
939 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
940 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
941 |
3209 | 942 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
943 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
944 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
945 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
946 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
947 else if(dstbpp==16) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
948 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
949 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
950 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
951 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
952 #ifdef DITHER1XBPP |
2748 | 953 "paddusb g6Dither, %%mm1 \n\t" |
954 "paddusb r5Dither, %%mm0 \n\t" | |
955 "paddusb b5Dither, %%mm3 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
956 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
957 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
958 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
959 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
960 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
961 "psrlw $3, %%mm3 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
962 "psllw $3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
963 "psllw $8, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
964 "pand g16Mask, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
965 "pand r16Mask, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
966 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
967 "por %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
968 "por %%mm1, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
969 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
970 MOVNTQ(%%mm0, (%4, %%eax, 2)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
971 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
972 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
973 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
974 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
975 |
3209 | 976 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
977 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
978 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
979 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
980 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
981 #else |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
982 if(dstbpp==32 || dstbpp==24) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
983 { |
2671 | 984 int i; |
3209 | 985 for(i=0;i<dstW;i++){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
986 // vertical linear interpolation && yuv2rgb in a single step: |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
987 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
988 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
989 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
2503 | 990 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; |
991 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
992 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
993 dest+=dstbpp>>3; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
994 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
995 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
996 else if(dstbpp==16) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
997 { |
2671 | 998 int i; |
3209 | 999 for(i=0;i<dstW;i++){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1000 // vertical linear interpolation && yuv2rgb in a single step: |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1001 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1002 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1003 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1004 |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1005 ((uint16_t*)dest)[i] = |
2584 | 1006 clip_table16b[(Y + yuvtab_40cf[U]) >>13] | |
1007 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1008 clip_table16r[(Y + yuvtab_3343[V]) >>13]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1009 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1010 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1011 else if(dstbpp==15) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1012 { |
2671 | 1013 int i; |
3209 | 1014 for(i=0;i<dstW;i++){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1015 // vertical linear interpolation && yuv2rgb in a single step: |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1016 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1017 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1018 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1019 |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1020 ((uint16_t*)dest)[i] = |
2584 | 1021 clip_table15b[(Y + yuvtab_40cf[U]) >>13] | |
1022 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1023 clip_table15r[(Y + yuvtab_3343[V]) >>13]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1024 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1025 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1026 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1027 }//FULL_UV_IPOL |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1028 else |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1029 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1030 #ifdef HAVE_MMX |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1031 if(dstbpp == 32) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1032 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1033 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1034 YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1035 WRITEBGR32 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1036 |
3209 | 1037 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1038 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1039 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1040 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1041 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1042 else if(dstbpp==24) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1043 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1044 asm volatile( |
2728 | 1045 "movl %4, %%ebx \n\t" |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1046 YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1047 WRITEBGR24 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1048 |
3209 | 1049 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1050 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1051 : "%eax", "%ebx" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1052 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1053 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1054 else if(dstbpp==15) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1055 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1056 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1057 YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1058 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1059 #ifdef DITHER1XBPP |
2748 | 1060 "paddusb b5Dither, %%mm2 \n\t" |
1061 "paddusb g5Dither, %%mm4 \n\t" | |
1062 "paddusb r5Dither, %%mm5 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1063 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1064 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1065 WRITEBGR15 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1066 |
3209 | 1067 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1068 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1069 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1070 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1071 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1072 else if(dstbpp==16) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1073 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1074 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1075 YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1076 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1077 #ifdef DITHER1XBPP |
2748 | 1078 "paddusb b5Dither, %%mm2 \n\t" |
1079 "paddusb g6Dither, %%mm4 \n\t" | |
1080 "paddusb r5Dither, %%mm5 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1081 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1082 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1083 WRITEBGR16 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1084 |
3209 | 1085 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1086 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1087 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1088 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1089 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1090 #else |
2575 | 1091 if(dstbpp==32) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1092 { |
2671 | 1093 int i; |
3209 | 1094 for(i=0; i<dstW-1; i+=2){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1095 // vertical linear interpolation && yuv2rgb in a single step: |
2575 | 1096 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
1097 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)]; | |
2585 | 1098 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1099 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2575 | 1100 |
1101 int Cb= yuvtab_40cf[U]; | |
1102 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1103 int Cr= yuvtab_3343[V]; | |
1104 | |
1105 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)]; | |
1106 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)]; | |
1107 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)]; | |
1108 | |
1109 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)]; | |
1110 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)]; | |
1111 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)]; | |
1112 } | |
1113 } | |
3344 | 1114 else if(dstbpp==24) |
2575 | 1115 { |
2671 | 1116 int i; |
3209 | 1117 for(i=0; i<dstW-1; i+=2){ |
2575 | 1118 // vertical linear interpolation && yuv2rgb in a single step: |
1119 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1120 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)]; | |
2585 | 1121 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1122 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2575 | 1123 |
1124 int Cb= yuvtab_40cf[U]; | |
1125 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1126 int Cr= yuvtab_3343[V]; | |
1127 | |
1128 dest[0]=clip_table[((Y1 + Cb) >>13)]; | |
1129 dest[1]=clip_table[((Y1 + Cg) >>13)]; | |
1130 dest[2]=clip_table[((Y1 + Cr) >>13)]; | |
1131 | |
1132 dest[3]=clip_table[((Y2 + Cb) >>13)]; | |
1133 dest[4]=clip_table[((Y2 + Cg) >>13)]; | |
1134 dest[5]=clip_table[((Y2 + Cr) >>13)]; | |
1135 dest+=6; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1136 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1137 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1138 else if(dstbpp==16) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1139 { |
2671 | 1140 int i; |
3209 | 1141 for(i=0; i<dstW-1; i+=2){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1142 // vertical linear interpolation && yuv2rgb in a single step: |
2575 | 1143 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
1144 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)]; | |
2585 | 1145 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1146 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1147 |
2575 | 1148 int Cb= yuvtab_40cf[U]; |
1149 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1150 int Cr= yuvtab_3343[V]; | |
1151 | |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1152 ((uint16_t*)dest)[i] = |
2584 | 1153 clip_table16b[(Y1 + Cb) >>13] | |
1154 clip_table16g[(Y1 + Cg) >>13] | | |
1155 clip_table16r[(Y1 + Cr) >>13]; | |
2575 | 1156 |
1157 ((uint16_t*)dest)[i+1] = | |
2584 | 1158 clip_table16b[(Y2 + Cb) >>13] | |
1159 clip_table16g[(Y2 + Cg) >>13] | | |
1160 clip_table16r[(Y2 + Cr) >>13]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1161 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1162 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1163 else if(dstbpp==15) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1164 { |
2671 | 1165 int i; |
3209 | 1166 for(i=0; i<dstW-1; i+=2){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1167 // vertical linear interpolation && yuv2rgb in a single step: |
2575 | 1168 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
1169 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)]; | |
2585 | 1170 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1171 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1172 |
2575 | 1173 int Cb= yuvtab_40cf[U]; |
1174 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1175 int Cr= yuvtab_3343[V]; | |
1176 | |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1177 ((uint16_t*)dest)[i] = |
2584 | 1178 clip_table15b[(Y1 + Cb) >>13] | |
1179 clip_table15g[(Y1 + Cg) >>13] | | |
1180 clip_table15r[(Y1 + Cr) >>13]; | |
1181 | |
2575 | 1182 ((uint16_t*)dest)[i+1] = |
2584 | 1183 clip_table15b[(Y2 + Cb) >>13] | |
1184 clip_table15g[(Y2 + Cg) >>13] | | |
1185 clip_table15r[(Y2 + Cr) >>13]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1186 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1187 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1188 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1189 } //!FULL_UV_IPOL |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1190 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1191 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1192 /** |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1193 * YV12 to RGB without scaling or interpolating |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1194 */ |
3344 | 1195 static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, |
1196 uint8_t *dest, int dstW, int uvalpha, int dstbpp) | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1197 { |
2671 | 1198 int uvalpha1=uvalpha^4095; |
3344 | 1199 const int yalpha1=0; |
2671 | 1200 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1201 if(fullUVIpol || allwaysIpol) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1202 { |
3344 | 1203 RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstbpp); |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1204 return; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1205 } |
2576 | 1206 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1207 #ifdef HAVE_MMX |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1208 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1209 { |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1210 if(dstbpp == 32) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1211 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1212 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1213 YSCALEYUV2RGB1 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1214 WRITEBGR32 |
3344 | 1215 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1216 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1217 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1218 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1219 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1220 else if(dstbpp==24) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1221 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1222 asm volatile( |
2728 | 1223 "movl %4, %%ebx \n\t" |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1224 YSCALEYUV2RGB1 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1225 WRITEBGR24 |
3344 | 1226 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1227 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1228 : "%eax", "%ebx" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1229 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1230 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1231 else if(dstbpp==15) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1232 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1233 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1234 YSCALEYUV2RGB1 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1235 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1236 #ifdef DITHER1XBPP |
2748 | 1237 "paddusb b5Dither, %%mm2 \n\t" |
1238 "paddusb g5Dither, %%mm4 \n\t" | |
1239 "paddusb r5Dither, %%mm5 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1240 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1241 WRITEBGR15 |
3344 | 1242 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1243 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1244 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1245 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1246 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1247 else if(dstbpp==16) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1248 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1249 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1250 YSCALEYUV2RGB1 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1251 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1252 #ifdef DITHER1XBPP |
2748 | 1253 "paddusb b5Dither, %%mm2 \n\t" |
1254 "paddusb g6Dither, %%mm4 \n\t" | |
1255 "paddusb r5Dither, %%mm5 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1256 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1257 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1258 WRITEBGR16 |
3344 | 1259 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1260 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1261 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1262 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1263 } |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1264 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1265 else |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1266 { |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1267 if(dstbpp == 32) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1268 { |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1269 asm volatile( |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1270 YSCALEYUV2RGB1b |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1271 WRITEBGR32 |
3344 | 1272 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1273 "m" (yalpha1), "m" (uvalpha1) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1274 : "%eax" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1275 ); |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1276 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1277 else if(dstbpp==24) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1278 { |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1279 asm volatile( |
2728 | 1280 "movl %4, %%ebx \n\t" |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1281 YSCALEYUV2RGB1b |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1282 WRITEBGR24 |
3344 | 1283 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1284 "m" (yalpha1), "m" (uvalpha1) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1285 : "%eax", "%ebx" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1286 ); |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1287 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1288 else if(dstbpp==15) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1289 { |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1290 asm volatile( |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1291 YSCALEYUV2RGB1b |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1292 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1293 #ifdef DITHER1XBPP |
2748 | 1294 "paddusb b5Dither, %%mm2 \n\t" |
1295 "paddusb g5Dither, %%mm4 \n\t" | |
1296 "paddusb r5Dither, %%mm5 \n\t" | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1297 #endif |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1298 WRITEBGR15 |
3344 | 1299 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1300 "m" (yalpha1), "m" (uvalpha1) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1301 : "%eax" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1302 ); |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1303 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1304 else if(dstbpp==16) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1305 { |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1306 asm volatile( |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1307 YSCALEYUV2RGB1b |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1308 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1309 #ifdef DITHER1XBPP |
2748 | 1310 "paddusb b5Dither, %%mm2 \n\t" |
1311 "paddusb g6Dither, %%mm4 \n\t" | |
1312 "paddusb r5Dither, %%mm5 \n\t" | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1313 #endif |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1314 |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1315 WRITEBGR16 |
3344 | 1316 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1317 "m" (yalpha1), "m" (uvalpha1) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1318 : "%eax" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1319 ); |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1320 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1321 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1322 #else |
2576 | 1323 //FIXME write 2 versions (for even & odd lines) |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1324 |
2576 | 1325 if(dstbpp==32) |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1326 { |
2671 | 1327 int i; |
3209 | 1328 for(i=0; i<dstW-1; i+=2){ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1329 // vertical linear interpolation && yuv2rgb in a single step: |
2576 | 1330 int Y1=yuvtab_2568[buf0[i]>>7]; |
1331 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
2585 | 1332 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1333 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2576 | 1334 |
1335 int Cb= yuvtab_40cf[U]; | |
1336 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1337 int Cr= yuvtab_3343[V]; | |
1338 | |
1339 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)]; | |
1340 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)]; | |
1341 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)]; | |
1342 | |
1343 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)]; | |
1344 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)]; | |
1345 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)]; | |
1346 } | |
1347 } | |
3344 | 1348 else if(dstbpp==24) |
2576 | 1349 { |
2671 | 1350 int i; |
3209 | 1351 for(i=0; i<dstW-1; i+=2){ |
2576 | 1352 // vertical linear interpolation && yuv2rgb in a single step: |
1353 int Y1=yuvtab_2568[buf0[i]>>7]; | |
1354 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
2585 | 1355 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1356 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2576 | 1357 |
1358 int Cb= yuvtab_40cf[U]; | |
1359 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1360 int Cr= yuvtab_3343[V]; | |
1361 | |
1362 dest[0]=clip_table[((Y1 + Cb) >>13)]; | |
1363 dest[1]=clip_table[((Y1 + Cg) >>13)]; | |
1364 dest[2]=clip_table[((Y1 + Cr) >>13)]; | |
1365 | |
1366 dest[3]=clip_table[((Y2 + Cb) >>13)]; | |
1367 dest[4]=clip_table[((Y2 + Cg) >>13)]; | |
1368 dest[5]=clip_table[((Y2 + Cr) >>13)]; | |
1369 dest+=6; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1370 } |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1371 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1372 else if(dstbpp==16) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1373 { |
2671 | 1374 int i; |
3209 | 1375 for(i=0; i<dstW-1; i+=2){ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1376 // vertical linear interpolation && yuv2rgb in a single step: |
2576 | 1377 int Y1=yuvtab_2568[buf0[i]>>7]; |
1378 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
2585 | 1379 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1380 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1381 |
2576 | 1382 int Cb= yuvtab_40cf[U]; |
1383 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1384 int Cr= yuvtab_3343[V]; | |
1385 | |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1386 ((uint16_t*)dest)[i] = |
2584 | 1387 clip_table16b[(Y1 + Cb) >>13] | |
1388 clip_table16g[(Y1 + Cg) >>13] | | |
1389 clip_table16r[(Y1 + Cr) >>13]; | |
2576 | 1390 |
1391 ((uint16_t*)dest)[i+1] = | |
2584 | 1392 clip_table16b[(Y2 + Cb) >>13] | |
1393 clip_table16g[(Y2 + Cg) >>13] | | |
1394 clip_table16r[(Y2 + Cr) >>13]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1395 } |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1396 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1397 else if(dstbpp==15) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1398 { |
2671 | 1399 int i; |
3209 | 1400 for(i=0; i<dstW-1; i+=2){ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1401 // vertical linear interpolation && yuv2rgb in a single step: |
2576 | 1402 int Y1=yuvtab_2568[buf0[i]>>7]; |
1403 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
2585 | 1404 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1405 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1406 |
2576 | 1407 int Cb= yuvtab_40cf[U]; |
1408 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1409 int Cr= yuvtab_3343[V]; | |
1410 | |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1411 ((uint16_t*)dest)[i] = |
2584 | 1412 clip_table15b[(Y1 + Cb) >>13] | |
1413 clip_table15g[(Y1 + Cg) >>13] | | |
1414 clip_table15r[(Y1 + Cr) >>13]; | |
1415 | |
2576 | 1416 ((uint16_t*)dest)[i+1] = |
2584 | 1417 clip_table15b[(Y2 + Cb) >>13] | |
1418 clip_table15g[(Y2 + Cg) >>13] | | |
1419 clip_table15r[(Y2 + Cr) >>13]; | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1420 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1421 } |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1422 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1423 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1424 |
3272 | 1425 // Bilinear / Bicubic scaling |
1426 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, | |
1427 int16_t *filter, int16_t *filterPos, int filterSize) | |
1428 { | |
1429 #ifdef HAVE_MMX | |
1430 if(filterSize==4) // allways true for upscaling, sometimes for down too | |
1431 { | |
1432 int counter= -2*dstW; | |
1433 filter-= counter*2; | |
1434 filterPos-= counter/2; | |
1435 dst-= counter/2; | |
1436 asm volatile( | |
1437 "pxor %%mm7, %%mm7 \n\t" | |
1438 "movq w02, %%mm6 \n\t" | |
1439 "pushl %%ebp \n\t" // we use 7 regs here ... | |
1440 "movl %%eax, %%ebp \n\t" | |
1441 ".balign 16 \n\t" | |
1442 "1: \n\t" | |
1443 "movzwl (%2, %%ebp), %%eax \n\t" | |
1444 "movzwl 2(%2, %%ebp), %%ebx \n\t" | |
1445 "movq (%1, %%ebp, 4), %%mm1 \n\t" | |
1446 "movq 8(%1, %%ebp, 4), %%mm3 \n\t" | |
1447 "movd (%3, %%eax), %%mm0 \n\t" | |
1448 "movd (%3, %%ebx), %%mm2 \n\t" | |
1449 "punpcklbw %%mm7, %%mm0 \n\t" | |
1450 "punpcklbw %%mm7, %%mm2 \n\t" | |
1451 "pmaddwd %%mm1, %%mm0 \n\t" | |
1452 "pmaddwd %%mm2, %%mm3 \n\t" | |
1453 "psrad $8, %%mm0 \n\t" | |
1454 "psrad $8, %%mm3 \n\t" | |
1455 "packssdw %%mm3, %%mm0 \n\t" | |
1456 "pmaddwd %%mm6, %%mm0 \n\t" | |
1457 "packssdw %%mm0, %%mm0 \n\t" | |
1458 "movd %%mm0, (%4, %%ebp) \n\t" | |
1459 "addl $4, %%ebp \n\t" | |
1460 " jnc 1b \n\t" | |
3352 | 1461 |
3272 | 1462 "popl %%ebp \n\t" |
1463 : "+a" (counter) | |
1464 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
1465 : "%ebx" | |
1466 ); | |
1467 } | |
1468 else if(filterSize==8) | |
1469 { | |
1470 int counter= -2*dstW; | |
1471 filter-= counter*4; | |
1472 filterPos-= counter/2; | |
1473 dst-= counter/2; | |
1474 asm volatile( | |
1475 "pxor %%mm7, %%mm7 \n\t" | |
1476 "movq w02, %%mm6 \n\t" | |
1477 "pushl %%ebp \n\t" // we use 7 regs here ... | |
1478 "movl %%eax, %%ebp \n\t" | |
1479 ".balign 16 \n\t" | |
1480 "1: \n\t" | |
1481 "movzwl (%2, %%ebp), %%eax \n\t" | |
1482 "movzwl 2(%2, %%ebp), %%ebx \n\t" | |
1483 "movq (%1, %%ebp, 8), %%mm1 \n\t" | |
1484 "movq 16(%1, %%ebp, 8), %%mm3 \n\t" | |
1485 "movd (%3, %%eax), %%mm0 \n\t" | |
1486 "movd (%3, %%ebx), %%mm2 \n\t" | |
1487 "punpcklbw %%mm7, %%mm0 \n\t" | |
1488 "punpcklbw %%mm7, %%mm2 \n\t" | |
1489 "pmaddwd %%mm1, %%mm0 \n\t" | |
1490 "pmaddwd %%mm2, %%mm3 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1491 |
3272 | 1492 "movq 8(%1, %%ebp, 8), %%mm1 \n\t" |
1493 "movq 24(%1, %%ebp, 8), %%mm5 \n\t" | |
1494 "movd 4(%3, %%eax), %%mm4 \n\t" | |
1495 "movd 4(%3, %%ebx), %%mm2 \n\t" | |
1496 "punpcklbw %%mm7, %%mm4 \n\t" | |
1497 "punpcklbw %%mm7, %%mm2 \n\t" | |
1498 "pmaddwd %%mm1, %%mm4 \n\t" | |
1499 "pmaddwd %%mm2, %%mm5 \n\t" | |
1500 "paddd %%mm4, %%mm0 \n\t" | |
1501 "paddd %%mm5, %%mm3 \n\t" | |
1502 | |
1503 "psrad $8, %%mm0 \n\t" | |
1504 "psrad $8, %%mm3 \n\t" | |
1505 "packssdw %%mm3, %%mm0 \n\t" | |
1506 "pmaddwd %%mm6, %%mm0 \n\t" | |
1507 "packssdw %%mm0, %%mm0 \n\t" | |
1508 "movd %%mm0, (%4, %%ebp) \n\t" | |
1509 "addl $4, %%ebp \n\t" | |
1510 " jnc 1b \n\t" | |
3344 | 1511 |
3272 | 1512 "popl %%ebp \n\t" |
1513 : "+a" (counter) | |
1514 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
1515 : "%ebx" | |
1516 ); | |
1517 } | |
1518 else | |
1519 { | |
1520 int counter= -2*dstW; | |
1521 // filter-= counter*filterSize/2; | |
1522 filterPos-= counter/2; | |
1523 dst-= counter/2; | |
1524 asm volatile( | |
1525 "pxor %%mm7, %%mm7 \n\t" | |
1526 "movq w02, %%mm6 \n\t" | |
1527 ".balign 16 \n\t" | |
1528 "1: \n\t" | |
1529 "movl %2, %%ecx \n\t" | |
1530 "movzwl (%%ecx, %0), %%eax \n\t" | |
1531 "movzwl 2(%%ecx, %0), %%ebx \n\t" | |
1532 "movl %5, %%ecx \n\t" | |
1533 "pxor %%mm4, %%mm4 \n\t" | |
1534 "pxor %%mm5, %%mm5 \n\t" | |
1535 "2: \n\t" | |
1536 "movq (%1), %%mm1 \n\t" | |
1537 "movq (%1, %6), %%mm3 \n\t" | |
1538 "movd (%%ecx, %%eax), %%mm0 \n\t" | |
1539 "movd (%%ecx, %%ebx), %%mm2 \n\t" | |
1540 "punpcklbw %%mm7, %%mm0 \n\t" | |
1541 "punpcklbw %%mm7, %%mm2 \n\t" | |
1542 "pmaddwd %%mm1, %%mm0 \n\t" | |
1543 "pmaddwd %%mm2, %%mm3 \n\t" | |
1544 "paddd %%mm3, %%mm5 \n\t" | |
1545 "paddd %%mm0, %%mm4 \n\t" | |
1546 "addl $8, %1 \n\t" | |
1547 "addl $4, %%ecx \n\t" | |
1548 "cmpl %4, %%ecx \n\t" | |
1549 " jb 2b \n\t" | |
1550 "addl %6, %1 \n\t" | |
1551 "psrad $8, %%mm4 \n\t" | |
1552 "psrad $8, %%mm5 \n\t" | |
1553 "packssdw %%mm5, %%mm4 \n\t" | |
1554 "pmaddwd %%mm6, %%mm4 \n\t" | |
1555 "packssdw %%mm4, %%mm4 \n\t" | |
1556 "movl %3, %%eax \n\t" | |
1557 "movd %%mm4, (%%eax, %0) \n\t" | |
1558 "addl $4, %0 \n\t" | |
1559 " jnc 1b \n\t" | |
3344 | 1560 |
3641 | 1561 : "+r" (counter), "+r" (filter) |
1562 : "m" (filterPos), "m" (dst), "m"(src+filterSize), | |
3272 | 1563 "m" (src), "r" (filterSize*2) |
3299 | 1564 : "%ebx", "%eax", "%ecx" |
3272 | 1565 ); |
1566 } | |
1567 #else | |
1568 int i; | |
1569 for(i=0; i<dstW; i++) | |
1570 { | |
1571 int j; | |
1572 int srcPos= filterPos[i]; | |
1573 int val=0; | |
3344 | 1574 // printf("filterPos: %d\n", filterPos[i]); |
3272 | 1575 for(j=0; j<filterSize; j++) |
1576 { | |
1577 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); | |
1578 val += ((int)src[srcPos + j])*filter[filterSize*i + j]; | |
1579 } | |
1580 // filter += hFilterSize; | |
1581 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ... | |
1582 // dst[i] = val>>7; | |
1583 } | |
1584 #endif | |
1585 } | |
1586 // *** horizontal scale Y line to temp buffer | |
3215 | 1587 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc) |
2469 | 1588 { |
3352 | 1589 #ifdef HAVE_MMX |
1590 // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one) | |
1591 if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed)) | |
1592 #else | |
3272 | 1593 if(sws_flags != SWS_FAST_BILINEAR) |
3352 | 1594 #endif |
3272 | 1595 { |
1596 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); | |
1597 } | |
1598 else // Fast Bilinear upscale / crap downscale | |
1599 { | |
2469 | 1600 #ifdef ARCH_X86 |
1601 #ifdef HAVE_MMX2 | |
2671 | 1602 int i; |
2469 | 1603 if(canMMX2BeUsed) |
1604 { | |
1605 asm volatile( | |
1606 "pxor %%mm7, %%mm7 \n\t" | |
1607 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | |
1608 "movd %5, %%mm6 \n\t" // xInc&0xFFFF | |
1609 "punpcklwd %%mm6, %%mm6 \n\t" | |
1610 "punpcklwd %%mm6, %%mm6 \n\t" | |
1611 "movq %%mm6, %%mm2 \n\t" | |
1612 "psllq $16, %%mm2 \n\t" | |
1613 "paddw %%mm6, %%mm2 \n\t" | |
1614 "psllq $16, %%mm2 \n\t" | |
1615 "paddw %%mm6, %%mm2 \n\t" | |
1616 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF | |
1617 "movq %%mm2, temp0 \n\t" | |
1618 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF | |
1619 "punpcklwd %%mm6, %%mm6 \n\t" | |
1620 "punpcklwd %%mm6, %%mm6 \n\t" | |
1621 "xorl %%eax, %%eax \n\t" // i | |
1622 "movl %0, %%esi \n\t" // src | |
1623 "movl %1, %%edi \n\t" // buf1 | |
1624 "movl %3, %%edx \n\t" // (xInc*4)>>16 | |
1625 "xorl %%ecx, %%ecx \n\t" | |
1626 "xorl %%ebx, %%ebx \n\t" | |
1627 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF | |
2520 | 1628 |
2469 | 1629 #define FUNNY_Y_CODE \ |
2520 | 1630 PREFETCH" 1024(%%esi) \n\t"\ |
1631 PREFETCH" 1056(%%esi) \n\t"\ | |
1632 PREFETCH" 1088(%%esi) \n\t"\ | |
2469 | 1633 "call funnyYCode \n\t"\ |
1634 "movq temp0, %%mm2 \n\t"\ | |
1635 "xorl %%ecx, %%ecx \n\t" | |
2520 | 1636 |
2469 | 1637 FUNNY_Y_CODE |
1638 FUNNY_Y_CODE | |
1639 FUNNY_Y_CODE | |
1640 FUNNY_Y_CODE | |
1641 FUNNY_Y_CODE | |
1642 FUNNY_Y_CODE | |
1643 FUNNY_Y_CODE | |
1644 FUNNY_Y_CODE | |
1645 | |
1646 :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16), | |
1647 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF) | |
1648 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | |
1649 ); | |
3215 | 1650 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; |
2469 | 1651 } |
1652 else | |
1653 { | |
1654 #endif | |
1655 //NO MMX just normal asm ... | |
1656 asm volatile( | |
1657 "xorl %%eax, %%eax \n\t" // i | |
1658 "xorl %%ebx, %%ebx \n\t" // xx | |
1659 "xorl %%ecx, %%ecx \n\t" // 2*xalpha | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
1660 ".balign 16 \n\t" |
2469 | 1661 "1: \n\t" |
1662 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | |
1663 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | |
1664 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
1665 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
1666 "shll $16, %%edi \n\t" | |
1667 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
1668 "movl %1, %%edi \n\t" | |
1669 "shrl $9, %%esi \n\t" | |
1670 "movw %%si, (%%edi, %%eax, 2) \n\t" | |
1671 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
1672 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry | |
1673 | |
1674 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | |
1675 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | |
1676 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
1677 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
1678 "shll $16, %%edi \n\t" | |
1679 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
1680 "movl %1, %%edi \n\t" | |
1681 "shrl $9, %%esi \n\t" | |
1682 "movw %%si, 2(%%edi, %%eax, 2) \n\t" | |
1683 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
1684 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry | |
1685 | |
1686 | |
1687 "addl $2, %%eax \n\t" | |
1688 "cmpl %2, %%eax \n\t" | |
1689 " jb 1b \n\t" | |
1690 | |
1691 | |
1692 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF) | |
1693 : "%eax", "%ebx", "%ecx", "%edi", "%esi" | |
1694 ); | |
1695 #ifdef HAVE_MMX2 | |
1696 } //if MMX2 cant be used | |
1697 #endif | |
1698 #else | |
2671 | 1699 int i; |
1700 unsigned int xpos=0; | |
1701 for(i=0;i<dstWidth;i++) | |
1702 { | |
1703 register unsigned int xx=xpos>>16; | |
1704 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
1705 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; | |
1706 xpos+=xInc; | |
1707 } | |
2469 | 1708 #endif |
3272 | 1709 } |
2469 | 1710 } |
1711 | |
3126 | 1712 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, |
3215 | 1713 uint8_t *src1, uint8_t *src2, int srcW, int xInc) |
2469 | 1714 { |
3352 | 1715 #ifdef HAVE_MMX |
1716 // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one) | |
1717 if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed)) | |
1718 #else | |
3272 | 1719 if(sws_flags != SWS_FAST_BILINEAR) |
3352 | 1720 #endif |
3272 | 1721 { |
1722 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
1723 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
1724 } | |
1725 else // Fast Bilinear upscale / crap downscale | |
1726 { | |
2469 | 1727 #ifdef ARCH_X86 |
1728 #ifdef HAVE_MMX2 | |
2671 | 1729 int i; |
2469 | 1730 if(canMMX2BeUsed) |
1731 { | |
1732 asm volatile( | |
1733 "pxor %%mm7, %%mm7 \n\t" | |
1734 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | |
1735 "movd %5, %%mm6 \n\t" // xInc&0xFFFF | |
1736 "punpcklwd %%mm6, %%mm6 \n\t" | |
1737 "punpcklwd %%mm6, %%mm6 \n\t" | |
1738 "movq %%mm6, %%mm2 \n\t" | |
1739 "psllq $16, %%mm2 \n\t" | |
1740 "paddw %%mm6, %%mm2 \n\t" | |
1741 "psllq $16, %%mm2 \n\t" | |
1742 "paddw %%mm6, %%mm2 \n\t" | |
1743 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF | |
1744 "movq %%mm2, temp0 \n\t" | |
1745 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF | |
1746 "punpcklwd %%mm6, %%mm6 \n\t" | |
1747 "punpcklwd %%mm6, %%mm6 \n\t" | |
1748 "xorl %%eax, %%eax \n\t" // i | |
1749 "movl %0, %%esi \n\t" // src | |
1750 "movl %1, %%edi \n\t" // buf1 | |
1751 "movl %3, %%edx \n\t" // (xInc*4)>>16 | |
1752 "xorl %%ecx, %%ecx \n\t" | |
1753 "xorl %%ebx, %%ebx \n\t" | |
1754 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF | |
1755 | |
1756 #define FUNNYUVCODE \ | |
2520 | 1757 PREFETCH" 1024(%%esi) \n\t"\ |
1758 PREFETCH" 1056(%%esi) \n\t"\ | |
1759 PREFETCH" 1088(%%esi) \n\t"\ | |
2469 | 1760 "call funnyUVCode \n\t"\ |
1761 "movq temp0, %%mm2 \n\t"\ | |
1762 "xorl %%ecx, %%ecx \n\t" | |
1763 | |
1764 FUNNYUVCODE | |
1765 FUNNYUVCODE | |
1766 FUNNYUVCODE | |
1767 FUNNYUVCODE | |
1768 | |
1769 FUNNYUVCODE | |
1770 FUNNYUVCODE | |
1771 FUNNYUVCODE | |
1772 FUNNYUVCODE | |
1773 "xorl %%eax, %%eax \n\t" // i | |
1774 "movl %6, %%esi \n\t" // src | |
1775 "movl %1, %%edi \n\t" // buf1 | |
1776 "addl $4096, %%edi \n\t" | |
1777 | |
1778 FUNNYUVCODE | |
1779 FUNNYUVCODE | |
1780 FUNNYUVCODE | |
1781 FUNNYUVCODE | |
1782 | |
1783 FUNNYUVCODE | |
1784 FUNNYUVCODE | |
1785 FUNNYUVCODE | |
1786 FUNNYUVCODE | |
1787 | |
1788 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16), | |
1789 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2) | |
1790 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | |
1791 ); | |
3344 | 1792 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) |
2469 | 1793 { |
3344 | 1794 // printf("%d %d %d\n", dstWidth, i, srcW); |
1795 dst[i] = src1[srcW-1]*128; | |
1796 dst[i+2048] = src2[srcW-1]*128; | |
2469 | 1797 } |
1798 } | |
1799 else | |
1800 { | |
1801 #endif | |
1802 asm volatile( | |
1803 "xorl %%eax, %%eax \n\t" // i | |
1804 "xorl %%ebx, %%ebx \n\t" // xx | |
1805 "xorl %%ecx, %%ecx \n\t" // 2*xalpha | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
1806 ".balign 16 \n\t" |
2469 | 1807 "1: \n\t" |
1808 "movl %0, %%esi \n\t" | |
1809 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx] | |
1810 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1] | |
1811 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
1812 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
1813 "shll $16, %%edi \n\t" | |
1814 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
1815 "movl %1, %%edi \n\t" | |
1816 "shrl $9, %%esi \n\t" | |
1817 "movw %%si, (%%edi, %%eax, 2) \n\t" | |
1818 | |
1819 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx] | |
1820 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1] | |
1821 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
1822 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
1823 "shll $16, %%edi \n\t" | |
1824 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
1825 "movl %1, %%edi \n\t" | |
1826 "shrl $9, %%esi \n\t" | |
1827 "movw %%si, 4096(%%edi, %%eax, 2)\n\t" | |
1828 | |
1829 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
1830 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry | |
1831 "addl $1, %%eax \n\t" | |
1832 "cmpl %2, %%eax \n\t" | |
1833 " jb 1b \n\t" | |
1834 | |
1835 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF), | |
1836 "r" (src2) | |
1837 : "%eax", "%ebx", "%ecx", "%edi", "%esi" | |
1838 ); | |
1839 #ifdef HAVE_MMX2 | |
1840 } //if MMX2 cant be used | |
1841 #endif | |
1842 #else | |
2671 | 1843 int i; |
1844 unsigned int xpos=0; | |
1845 for(i=0;i<dstWidth;i++) | |
1846 { | |
1847 register unsigned int xx=xpos>>16; | |
1848 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
1849 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | |
1850 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | |
2566 | 1851 /* slower |
1852 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; | |
1853 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; | |
1854 */ | |
2671 | 1855 xpos+=xInc; |
1856 } | |
2469 | 1857 #endif |
3272 | 1858 } |
1859 } | |
1860 | |
3352 | 1861 static inline void RENAME(initFilter)(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc, |
3344 | 1862 int srcW, int dstW, int filterAlign, int one) |
3272 | 1863 { |
1864 int i; | |
3352 | 1865 double filter[8000]; |
3272 | 1866 #ifdef HAVE_MMX |
3352 | 1867 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions) |
3272 | 1868 #endif |
3344 | 1869 |
1870 if(ABS(xInc - 0x10000) <10) // unscaled | |
1871 { | |
1872 int i; | |
1873 *filterSize= (1 +(filterAlign-1)) & (~(filterAlign-1)); // 1 or 4 normaly | |
1874 for(i=0; i<dstW*(*filterSize); i++) filter[i]=0; | |
1875 | |
1876 for(i=0; i<dstW; i++) | |
1877 { | |
1878 filter[i*(*filterSize)]=1; | |
1879 filterPos[i]=i; | |
1880 } | |
1881 | |
1882 } | |
1883 else if(xInc <= (1<<16) || sws_flags==SWS_FAST_BILINEAR) // upscale | |
3272 | 1884 { |
1885 int i; | |
1886 int xDstInSrc; | |
1887 if(sws_flags==SWS_BICUBIC) *filterSize= 4; | |
1888 else *filterSize= 2; | |
1889 // printf("%d %d %d\n", filterSize, srcW, dstW); | |
3344 | 1890 *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1)); |
1891 | |
3272 | 1892 xDstInSrc= xInc - 0x8000; |
1893 for(i=0; i<dstW; i++) | |
1894 { | |
1895 int xx= (xDstInSrc>>16) - (*filterSize>>1) + 1; | |
1896 int j; | |
3344 | 1897 |
3272 | 1898 filterPos[i]= xx; |
1899 if(sws_flags == SWS_BICUBIC) | |
3344 | 1900 { |
3272 | 1901 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16); |
3352 | 1902 double y1,y2,y3,y4; |
3272 | 1903 double A= -0.75; |
1904 // Equation is from VirtualDub | |
3352 | 1905 y1 = ( + A*d - 2.0*A*d*d + A*d*d*d); |
1906 y2 = (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d); | |
1907 y3 = ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d); | |
1908 y4 = ( + A*d*d - A*d*d*d); | |
3344 | 1909 |
3272 | 1910 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc); |
1911 filter[i*(*filterSize) + 0]= y1; | |
1912 filter[i*(*filterSize) + 1]= y2; | |
1913 filter[i*(*filterSize) + 2]= y3; | |
1914 filter[i*(*filterSize) + 3]= y4; | |
1915 // printf("%1.3f %d, %d, %d, %d\n",d , y1, y2, y3, y4); | |
1916 } | |
3344 | 1917 else |
3272 | 1918 { |
1919 for(j=0; j<*filterSize; j++) | |
1920 { | |
1921 double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16); | |
3352 | 1922 double coeff= 1.0 - d; |
3344 | 1923 if(coeff<0) coeff=0; |
3272 | 1924 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc); |
1925 filter[i*(*filterSize) + j]= coeff; | |
1926 xx++; | |
1927 } | |
3344 | 1928 } |
1929 xDstInSrc+= xInc; | |
3272 | 1930 } |
1931 } | |
1932 else // downscale | |
1933 { | |
1934 int xDstInSrc; | |
1935 if(sws_flags==SWS_BICUBIC) *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW); | |
1936 else *filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW); | |
1937 // printf("%d %d %d\n", *filterSize, srcW, dstW); | |
3344 | 1938 *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1)); |
1939 | |
3272 | 1940 xDstInSrc= xInc - 0x8000; |
1941 for(i=0; i<dstW; i++) | |
1942 { | |
1943 int xx= (int)((double)xDstInSrc/(double)(1<<16) - *filterSize*0.5 + 0.5); | |
1944 int j; | |
3344 | 1945 |
3272 | 1946 filterPos[i]= xx; |
1947 for(j=0; j<*filterSize; j++) | |
1948 { | |
1949 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc; | |
3352 | 1950 double coeff; |
3272 | 1951 if(sws_flags == SWS_BICUBIC) |
3344 | 1952 { |
3272 | 1953 double A= -0.75; |
1954 // d*=2; | |
1955 // Equation is from VirtualDub | |
1956 if(d<1.0) | |
3352 | 1957 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d); |
3272 | 1958 else if(d<2.0) |
3352 | 1959 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d); |
3272 | 1960 else |
3352 | 1961 coeff=0.0; |
3272 | 1962 } |
3344 | 1963 else |
3272 | 1964 { |
3352 | 1965 coeff= 1.0 - d; |
3344 | 1966 if(coeff<0) coeff=0; |
3272 | 1967 } |
3344 | 1968 // if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc); |
3272 | 1969 filter[i*(*filterSize) + j]= coeff; |
1970 xx++; | |
1971 } | |
1972 xDstInSrc+= xInc; | |
1973 } | |
1974 } | |
3344 | 1975 |
3272 | 1976 //fix borders |
1977 for(i=0; i<dstW; i++) | |
1978 { | |
1979 int j; | |
1980 if(filterPos[i] < 0) | |
1981 { | |
1982 // Move filter coeffs left to compensate for filterPos | |
1983 for(j=1; j<*filterSize; j++) | |
1984 { | |
1985 int left= MAX(j + filterPos[i], 0); | |
1986 filter[i*(*filterSize) + left] += filter[i*(*filterSize) + j]; | |
1987 filter[i*(*filterSize) + j]=0; | |
1988 } | |
1989 filterPos[i]= 0; | |
1990 } | |
1991 | |
3352 | 1992 if(filterPos[i] + (*filterSize) > srcW) |
3272 | 1993 { |
3352 | 1994 int shift= filterPos[i] + (*filterSize) - srcW; |
3272 | 1995 // Move filter coeffs right to compensate for filterPos |
3352 | 1996 for(j=(*filterSize)-2; j>=0; j--) |
3272 | 1997 { |
3352 | 1998 int right= MIN(j + shift, (*filterSize)-1); |
3272 | 1999 filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j]; |
2000 filter[i*(*filterSize) +j]=0; | |
2001 } | |
3352 | 2002 filterPos[i]= srcW - (*filterSize); |
3272 | 2003 } |
2004 } | |
3344 | 2005 |
2006 //FIXME try to align filterpos if possible / try to shift filterpos to put zeros at the end | |
2007 // and skip these than later | |
2008 | |
3272 | 2009 //Normalize |
2010 for(i=0; i<dstW; i++) | |
2011 { | |
2012 int j; | |
2013 double sum=0; | |
3344 | 2014 double scale= one; |
3272 | 2015 for(j=0; j<*filterSize; j++) |
2016 { | |
2017 sum+= filter[i*(*filterSize) + j]; | |
2018 } | |
2019 scale/= sum; | |
2020 for(j=0; j<*filterSize; j++) | |
2021 { | |
3352 | 2022 dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale); |
3272 | 2023 } |
2024 } | |
2469 | 2025 } |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
2026 |
3344 | 2027 #ifdef HAVE_MMX2 |
2028 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode) | |
2029 { | |
2030 uint8_t *fragment; | |
2031 int imm8OfPShufW1; | |
2032 int imm8OfPShufW2; | |
2033 int fragmentLength; | |
2034 | |
2035 int xpos, i; | |
2036 | |
2037 // create an optimized horizontal scaling routine | |
2038 | |
2039 //code fragment | |
2040 | |
2041 asm volatile( | |
2042 "jmp 9f \n\t" | |
2043 // Begin | |
2044 "0: \n\t" | |
2045 "movq (%%esi), %%mm0 \n\t" //FIXME Alignment | |
2046 "movq %%mm0, %%mm1 \n\t" | |
2047 "psrlq $8, %%mm0 \n\t" | |
2048 "punpcklbw %%mm7, %%mm1 \n\t" | |
2049 "movq %%mm2, %%mm3 \n\t" | |
2050 "punpcklbw %%mm7, %%mm0 \n\t" | |
2051 "addw %%bx, %%cx \n\t" //2*xalpha += (4*lumXInc)&0xFFFF | |
2052 "pshufw $0xFF, %%mm1, %%mm1 \n\t" | |
2053 "1: \n\t" | |
2054 "adcl %%edx, %%esi \n\t" //xx+= (4*lumXInc)>>16 + carry | |
2055 "pshufw $0xFF, %%mm0, %%mm0 \n\t" | |
2056 "2: \n\t" | |
2057 "psrlw $9, %%mm3 \n\t" | |
2058 "psubw %%mm1, %%mm0 \n\t" | |
2059 "pmullw %%mm3, %%mm0 \n\t" | |
2060 "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF | |
2061 "psllw $7, %%mm1 \n\t" | |
2062 "paddw %%mm1, %%mm0 \n\t" | |
2063 | |
2064 "movq %%mm0, (%%edi, %%eax) \n\t" | |
2065 | |
2066 "addl $8, %%eax \n\t" | |
2067 // End | |
2068 "9: \n\t" | |
2069 // "int $3\n\t" | |
2070 "leal 0b, %0 \n\t" | |
2071 "leal 1b, %1 \n\t" | |
2072 "leal 2b, %2 \n\t" | |
2073 "decl %1 \n\t" | |
2074 "decl %2 \n\t" | |
2075 "subl %0, %1 \n\t" | |
2076 "subl %0, %2 \n\t" | |
2077 "leal 9b, %3 \n\t" | |
2078 "subl %0, %3 \n\t" | |
2079 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2), | |
2080 "=r" (fragmentLength) | |
2081 ); | |
2082 | |
2083 xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers | |
2084 | |
2085 for(i=0; i<dstW/8; i++) | |
2086 { | |
2087 int xx=xpos>>16; | |
2088 | |
2089 if((i&3) == 0) | |
2090 { | |
2091 int a=0; | |
2092 int b=((xpos+xInc)>>16) - xx; | |
2093 int c=((xpos+xInc*2)>>16) - xx; | |
2094 int d=((xpos+xInc*3)>>16) - xx; | |
2095 | |
2096 memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength); | |
2097 | |
2098 funnyCode[fragmentLength*i/4 + imm8OfPShufW1]= | |
2099 funnyCode[fragmentLength*i/4 + imm8OfPShufW2]= | |
2100 a | (b<<2) | (c<<4) | (d<<6); | |
2101 | |
2102 // if we dont need to read 8 bytes than dont :), reduces the chance of | |
2103 // crossing a cache line | |
2104 if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E; | |
2105 | |
2106 funnyCode[fragmentLength*(i+4)/4]= RET; | |
2107 } | |
2108 xpos+=xInc; | |
2109 } | |
2110 /* | |
2111 xpos= 0; //chrXInc/2 - 0x10000; // difference between centers of chrom samples | |
2112 for(i=0; i<dstUVw/8; i++) | |
2113 { | |
2114 int xx=xpos>>16; | |
2115 | |
2116 if((i&3) == 0) | |
2117 { | |
2118 int a=0; | |
2119 int b=((xpos+chrXInc)>>16) - xx; | |
2120 int c=((xpos+chrXInc*2)>>16) - xx; | |
2121 int d=((xpos+chrXInc*3)>>16) - xx; | |
2122 | |
2123 memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength); | |
2124 | |
2125 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]= | |
2126 funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]= | |
2127 a | (b<<2) | (c<<4) | (d<<6); | |
2128 | |
2129 // if we dont need to read 8 bytes than dont :), reduces the chance of | |
2130 // crossing a cache line | |
2131 if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E; | |
2132 | |
2133 funnyUVCode[fragmentLength*(i+4)/4]= RET; | |
2134 } | |
2135 xpos+=chrXInc; | |
2136 } | |
2137 */ | |
2138 // funnyCode[0]= RET; | |
2139 } | |
2140 #endif // HAVE_MMX2 | |
2141 | |
3209 | 2142 static void RENAME(SwScale_YV12slice)(unsigned char* srcptr[],int stride[], int srcSliceY , |
2143 int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp, | |
2144 int srcW, int srcH, int dstW, int dstH){ | |
2216 | 2145 |
2146 | |
3344 | 2147 unsigned int lumXInc= (srcW << 16) / dstW; |
2148 unsigned int lumYInc= (srcH << 16) / dstH; | |
2149 unsigned int chrXInc; | |
2150 unsigned int chrYInc; | |
2151 | |
3209 | 2152 static int dstY; |
2216 | 2153 |
3344 | 2154 // used to detect a size change |
3272 | 2155 static int oldDstW= -1; |
2156 static int oldSrcW= -1; | |
3344 | 2157 static int oldDstH= -1; |
2158 static int oldSrcH= -1; | |
2159 static int oldFlags=-1; | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2160 |
3344 | 2161 static int lastInLumBuf; |
2162 static int lastInChrBuf; | |
2163 | |
2164 int chrDstW, chrDstH; | |
2165 | |
2166 static int lumBufIndex=0; | |
2167 static int chrBufIndex=0; | |
2216 | 2168 |
3344 | 2169 static int firstTime=1; |
2680 | 2170 |
3352 | 2171 const int widthAlign= dstbpp==12 ? 16 : 8; |
2172 const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4) | |
2173 const int over= dstbpp==12 ? (((dstW+15)&(~15))) - dststride | |
2174 : (((dstW+7)&(~7)))*bytespp - dststride; | |
2175 if(dststride%widthAlign !=0 ) | |
3344 | 2176 { |
2177 if(firstTime) | |
2178 fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n" | |
3352 | 2179 "SwScaler: ->cannot do aligned memory acesses anymore\n", |
2180 widthAlign); | |
3344 | 2181 } |
2680 | 2182 |
3352 | 2183 if(over>0) |
2184 { | |
2185 if(firstTime) | |
2186 fprintf(stderr, "SwScaler: Warning: output width is not a multiple of 8 (16 for YV12)\n" | |
2187 "SwScaler: and dststride is not large enough to handle %d extra bytes\n" | |
2188 "SwScaler: ->using unoptimized C version for last line(s)\n", | |
2189 over); | |
2190 } | |
2191 | |
2192 | |
2193 | |
3215 | 2194 //printf("%d %d %d %d\n", srcW, srcH, dstW, dstH); |
3344 | 2195 //printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH); |
3215 | 2196 |
2270 | 2197 #ifdef HAVE_MMX2 |
3344 | 2198 canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0; |
2199 if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR) | |
2200 { | |
3352 | 2201 if(firstTime) |
3344 | 2202 fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n"); |
2203 } | |
3352 | 2204 #else |
2205 canMMX2BeUsed=0; // should be 0 anyway but ... | |
3344 | 2206 #endif |
2207 | |
2208 if(firstTime) | |
2209 { | |
3345 | 2210 #if defined (DITHER1XBPP) && defined (HAVE_MMX) |
2211 char *dither= " dithered"; | |
3344 | 2212 #else |
2213 char *dither= ""; | |
2270 | 2214 #endif |
3344 | 2215 if(sws_flags==SWS_FAST_BILINEAR) |
2216 fprintf(stderr, "SwScaler: FAST_BILINEAR scaler "); | |
2217 else if(sws_flags==SWS_BILINEAR) | |
2218 fprintf(stderr, "SwScaler: BILINEAR scaler "); | |
2219 else if(sws_flags==SWS_BICUBIC) | |
2220 fprintf(stderr, "SwScaler: BICUBIC scaler "); | |
2221 else | |
2222 fprintf(stderr, "SwScaler: ehh flags invalid?! "); | |
2223 | |
2224 if(dstbpp==15) | |
3345 | 2225 fprintf(stderr, "with%s BGR15 output ", dither); |
3344 | 2226 else if(dstbpp==16) |
3345 | 2227 fprintf(stderr, "with%s BGR16 output ", dither); |
3344 | 2228 else if(dstbpp==24) |
2229 fprintf(stderr, "with BGR24 output "); | |
2230 else if(dstbpp==32) | |
2231 fprintf(stderr, "with BGR32 output "); | |
2232 else if(dstbpp==12) | |
2233 fprintf(stderr, "with YV12 output "); | |
2234 else | |
2235 fprintf(stderr, "without output "); | |
2236 | |
2237 #ifdef HAVE_MMX2 | |
2238 fprintf(stderr, "using MMX2\n"); | |
2239 #elif defined (HAVE_3DNOW) | |
2240 fprintf(stderr, "using 3DNOW\n"); | |
2241 #elif defined (HAVE_MMX) | |
2242 fprintf(stderr, "using MMX\n"); | |
2243 #elif defined (ARCH_X86) | |
3352 | 2244 fprintf(stderr, "using X86 ASM\n"); |
3344 | 2245 #else |
2246 fprintf(stderr, "using C\n"); | |
2247 #endif | |
2248 } | |
2249 | |
2270 | 2250 |
2279
9b9c3363abbe
horizontal scaling bugs fixed, should be mostly bugfree now
michael
parents:
2274
diff
changeset
|
2251 // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst |
9b9c3363abbe
horizontal scaling bugs fixed, should be mostly bugfree now
michael
parents:
2274
diff
changeset
|
2252 // n-2 is the last chrominance sample available |
3344 | 2253 // this is not perfect, but noone shuld notice the difference, the more correct variant |
2279
9b9c3363abbe
horizontal scaling bugs fixed, should be mostly bugfree now
michael
parents:
2274
diff
changeset
|
2254 // would be like the vertical one, but that would require some special code for the |
9b9c3363abbe
horizontal scaling bugs fixed, should be mostly bugfree now
michael
parents:
2274
diff
changeset
|
2255 // first and last pixel |
3344 | 2256 if(sws_flags==SWS_FAST_BILINEAR) |
2257 { | |
2258 if(canMMX2BeUsed) lumXInc+= 20; | |
3352 | 2259 #ifndef HAVE_MMX //we dont use the x86asm scaler if mmx is available |
3344 | 2260 else lumXInc = ((srcW-2)<<16)/(dstW-2) - 20; |
3352 | 2261 #endif |
3344 | 2262 } |
2279
9b9c3363abbe
horizontal scaling bugs fixed, should be mostly bugfree now
michael
parents:
2274
diff
changeset
|
2263 |
3344 | 2264 if(fullUVIpol && !(dstbpp==12)) chrXInc= lumXInc>>1, chrDstW= dstW; |
3352 | 2265 else chrXInc= lumXInc, chrDstW= (dstW+1)>>1; |
3344 | 2266 |
3352 | 2267 if(dstbpp==12) chrYInc= lumYInc, chrDstH= (dstH+1)>>1; |
3344 | 2268 else chrYInc= lumYInc>>1, chrDstH= dstH; |
2269 | |
2271 | 2270 // force calculation of the horizontal interpolation of the first line |
2271 | |
3209 | 2272 if(srcSliceY ==0){ |
3215 | 2273 // printf("dstW %d, srcw %d, mmx2 %d\n", dstW, srcW, canMMX2BeUsed); |
3344 | 2274 lumBufIndex=0; |
2275 chrBufIndex=0; | |
3209 | 2276 dstY=0; |
2638
d4211422b6cc
right green line bugfix for width not %8==0 (untested -vo vesa doesnt work)
michael
parents:
2585
diff
changeset
|
2277 |
3344 | 2278 //precalculate horizontal scaler filter coefficients |
2279 if(oldDstW!=dstW || oldSrcW!=srcW || oldFlags!=sws_flags) | |
2638
d4211422b6cc
right green line bugfix for width not %8==0 (untested -vo vesa doesnt work)
michael
parents:
2585
diff
changeset
|
2280 { |
3344 | 2281 #ifdef HAVE_MMX |
2282 const int filterAlign=4; | |
2283 #else | |
2284 const int filterAlign=1; | |
2285 #endif | |
2286 oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags; | |
2638
d4211422b6cc
right green line bugfix for width not %8==0 (untested -vo vesa doesnt work)
michael
parents:
2585
diff
changeset
|
2287 |
3352 | 2288 RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc, |
2289 srcW , dstW , filterAlign, 1<<14); | |
2290 RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc, | |
2291 (srcW+1)>>1, chrDstW, filterAlign, 1<<14); | |
3344 | 2292 |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2293 #ifdef HAVE_MMX2 |
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2294 // cant downscale !!! |
3344 | 2295 if(canMMX2BeUsed && sws_flags == SWS_FAST_BILINEAR) |
2296 { | |
2297 initMMX2HScaler(dstW , lumXInc, funnyYCode); | |
2298 initMMX2HScaler(chrDstW, chrXInc, funnyUVCode); | |
2299 } | |
2300 #endif | |
2301 } // Init Horizontal stuff | |
2302 | |
2303 if(oldDstH!=dstH || oldSrcH!=srcH || oldFlags!=sws_flags) | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2304 { |
3344 | 2305 int i; |
2306 oldDstH= dstH; oldSrcH= srcH; oldFlags= sws_flags; //FIXME swsflags conflict with x check | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2307 |
3344 | 2308 // deallocate pixbufs |
2309 for(i=0; i<vLumBufSize; i++) free(lumPixBuf[i]); | |
2310 for(i=0; i<vChrBufSize; i++) free(chrPixBuf[i]); | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2311 |
3344 | 2312 RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc, |
2313 srcH , dstH, 1, (1<<12)-4); | |
2314 RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc, | |
3352 | 2315 (srcH+1)>>1, chrDstH, 1, (1<<12)-4); |
2264
7851375ea156
increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents:
2237
diff
changeset
|
2316 |
3344 | 2317 // Calculate Buffer Sizes so that they wont run out while handling these damn slices |
2318 vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize; | |
2319 for(i=0; i<dstH; i++) | |
2320 { | |
2321 int chrI= i*chrDstH / dstH; | |
2322 int nextSlice= MAX(vLumFilterPos[i ] + vLumFilterSize - 1, | |
2323 ((vChrFilterPos[chrI] + vChrFilterSize - 1)<<1)); | |
2324 nextSlice&= ~1; // Slices start at even boundaries | |
2325 if(vLumFilterPos[i ] + vLumBufSize < nextSlice) | |
2326 vLumBufSize= nextSlice - vLumFilterPos[i ]; | |
2327 if(vChrFilterPos[chrI] + vChrBufSize < (nextSlice>>1)) | |
2328 vChrBufSize= (nextSlice>>1) - vChrFilterPos[chrI]; | |
2329 } | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2330 |
3344 | 2331 // allocate pixbufs (we use dynamic allocation because otherwise we would need to |
2332 // allocate several megabytes to handle all possible cases) | |
2333 for(i=0; i<vLumBufSize; i++) | |
2334 lumPixBuf[i]= lumPixBuf[i+vLumBufSize]= (uint16_t*)memalign(8, 4000); | |
2335 for(i=0; i<vChrBufSize; i++) | |
2336 chrPixBuf[i]= chrPixBuf[i+vChrBufSize]= (uint16_t*)memalign(8, 8000); | |
2337 | |
2338 //try to avoid drawing green stuff between the right end and the stride end | |
2339 for(i=0; i<vLumBufSize; i++) memset(lumPixBuf[i], 0, 4000); | |
2340 for(i=0; i<vChrBufSize; i++) memset(chrPixBuf[i], 64, 8000); | |
2341 | |
3641 | 2342 ASSERT(chrDstH<=dstH) |
2343 ASSERT(vLumFilterSize*dstH*4<16000) | |
2344 ASSERT(vChrFilterSize*chrDstH*4<16000) | |
3344 | 2345 #ifdef HAVE_MMX |
2346 // pack filter data for mmx code | |
2347 for(i=0; i<vLumFilterSize*dstH; i++) | |
2348 lumMmxFilter[4*i]=lumMmxFilter[4*i+1]=lumMmxFilter[4*i+2]=lumMmxFilter[4*i+3]= | |
2349 vLumFilter[i]; | |
2350 for(i=0; i<vChrFilterSize*chrDstH; i++) | |
2351 chrMmxFilter[4*i]=chrMmxFilter[4*i+1]=chrMmxFilter[4*i+2]=chrMmxFilter[4*i+3]= | |
2352 vChrFilter[i]; | |
2353 #endif | |
2354 } | |
2355 | |
3352 | 2356 if(firstTime && verbose) |
2357 { | |
2358 #ifdef HAVE_MMX2 | |
2359 int mmx2=1; | |
2360 #else | |
2361 int mmx2=0; | |
2362 #endif | |
2363 #ifdef HAVE_MMX | |
2364 int mmx=1; | |
2365 #else | |
2366 int mmx=0; | |
2367 #endif | |
2368 | |
2369 #ifdef HAVE_MMX | |
2370 if(canMMX2BeUsed && sws_flags==SWS_FAST_BILINEAR) | |
2371 printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n"); | |
2372 else | |
2373 { | |
2374 if(hLumFilterSize==4) | |
2375 printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n"); | |
2376 else if(hLumFilterSize==8) | |
2377 printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n"); | |
2378 else | |
2379 printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n"); | |
2380 | |
2381 if(hChrFilterSize==4) | |
2382 printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n"); | |
2383 else if(hChrFilterSize==8) | |
2384 printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n"); | |
2385 else | |
2386 printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n"); | |
2387 } | |
2388 #elif defined (ARCH_X86) | |
2389 printf("SwScaler: using X86-Asm scaler for horizontal scaling\n"); | |
2390 #else | |
2391 if(sws_flags==SWS_FAST_BILINEAR) | |
2392 printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n"); | |
2393 else | |
2394 printf("SwScaler: using C scaler for horizontal scaling\n"); | |
2395 #endif | |
2396 | |
2397 if(dstbpp==12) | |
2398 { | |
2399 if(vLumFilterSize==1) | |
2400 printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12)\n", mmx ? "MMX" : "C"); | |
2401 else | |
2402 printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12)\n", mmx ? "MMX" : "C"); | |
2403 } | |
2404 else | |
2405 { | |
2406 if(vLumFilterSize==1 && vChrFilterSize==2) | |
2407 printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n" | |
2408 "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n", mmx ? "MMX" : "C"); | |
2409 else if(vLumFilterSize==2 && vChrFilterSize==2) | |
2410 printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C"); | |
2411 else | |
2412 printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C"); | |
2413 } | |
2414 | |
2415 if(dstbpp==24) | |
2416 printf("SwScaler: using %s YV12->BGR24 Converter\n", | |
2417 mmx2 ? "MMX2" : (mmx ? "MMX" : "C")); | |
2418 else | |
2419 printf("SwScaler: using %s YV12->BGR%d Converter\n", mmx ? "MMX" : "C", dstbpp); | |
2420 | |
2421 printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH); | |
2422 } | |
2423 | |
3344 | 2424 lastInLumBuf= -1; |
2425 lastInChrBuf= -1; | |
2426 } // if(firstLine) | |
2264
7851375ea156
increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents:
2237
diff
changeset
|
2427 |
3344 | 2428 for(;dstY < dstH; dstY++){ |
2429 unsigned char *dest =dstptr[0]+dststride*dstY; | |
2430 unsigned char *uDest=dstptr[1]+(dststride>>1)*(dstY>>1); | |
2431 unsigned char *vDest=dstptr[2]+(dststride>>1)*(dstY>>1); | |
2432 const int chrDstY= dstbpp==12 ? (dstY>>1) : dstY; | |
2433 | |
2434 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input | |
2435 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input | |
2436 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input | |
2437 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input | |
2438 | |
2439 if(sws_flags == SWS_FAST_BILINEAR) | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2440 { |
3344 | 2441 //handle holes |
2442 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; | |
2443 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2444 } |
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2445 |
3344 | 2446 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1) |
2447 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1) | |
2216 | 2448 |
3344 | 2449 // Do we have enough lines in this slice to output the dstY line |
2450 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH)>>1)) | |
2469 | 2451 { |
3344 | 2452 //Do horizontal scaling |
2453 while(lastInLumBuf < lastLumSrcY) | |
2454 { | |
2455 uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0]; | |
2456 lumBufIndex++; | |
2457 ASSERT(lumBufIndex < 2*vLumBufSize) | |
2458 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
2459 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
2460 // printf("%d %d\n", lumBufIndex, vLumBufSize); | |
2461 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc); | |
2462 lastInLumBuf++; | |
2463 } | |
2464 while(lastInChrBuf < lastChrSrcY) | |
2465 { | |
2466 uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1]; | |
2467 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2]; | |
2468 chrBufIndex++; | |
2469 ASSERT(chrBufIndex < 2*vChrBufSize) | |
2470 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1)) | |
2471 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) | |
3352 | 2472 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc); |
3344 | 2473 lastInChrBuf++; |
2474 } | |
2475 //wrap buf index around to stay inside the ring buffer | |
2476 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
2477 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
2469 | 2478 } |
3344 | 2479 else // not enough lines left in this slice -> load the rest in the buffer |
2469 | 2480 { |
3344 | 2481 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n", |
2482 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY, | |
2483 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize, | |
2484 vChrBufSize, vLumBufSize); | |
2485 */ | |
2486 //Do horizontal scaling | |
2487 while(lastInLumBuf+1 < srcSliceY + srcSliceH) | |
2469 | 2488 { |
3344 | 2489 uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0]; |
2490 lumBufIndex++; | |
2491 ASSERT(lumBufIndex < 2*vLumBufSize) | |
2492 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
2493 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
2494 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc); | |
2495 lastInLumBuf++; | |
2469 | 2496 } |
3344 | 2497 while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1)) |
2498 { | |
2499 uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1]; | |
2500 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2]; | |
2501 chrBufIndex++; | |
2502 ASSERT(chrBufIndex < 2*vChrBufSize) | |
2503 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1)) | |
2504 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) | |
3352 | 2505 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc); |
3344 | 2506 lastInChrBuf++; |
2507 } | |
2508 //wrap buf index around to stay inside the ring buffer | |
2509 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
2510 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
2511 break; //we cant output a dstY line so lets try with the next slice | |
2469 | 2512 } |
2264
7851375ea156
increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents:
2237
diff
changeset
|
2513 |
2748 | 2514 #ifdef HAVE_MMX |
3344 | 2515 b5Dither= dither8[dstY&1]; |
2516 g6Dither= dither4[dstY&1]; | |
2517 g5Dither= dither8[dstY&1]; | |
2518 r5Dither= dither8[(dstY+1)&1]; | |
2748 | 2519 #endif |
3352 | 2520 if(dstY < dstH-2 || over<=0) |
2521 { | |
3344 | 2522 if(dstbpp==12) //YV12 |
2523 { | |
2524 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
2525 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12 | |
2526 { | |
2527 int16_t *lumBuf = lumPixBuf[0]; | |
2528 int16_t *chrBuf= chrPixBuf[0]; | |
2529 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW); | |
2530 } | |
2531 else //General YV12 | |
2532 { | |
2533 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
2534 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
2535 RENAME(yuv2yuvX)( | |
2536 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
2537 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2538 dest, uDest, vDest, dstW, | |
2539 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4); | |
2540 } | |
2541 } | |
2542 else | |
2543 { | |
2544 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
2545 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
2546 | |
2547 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
2548 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
2549 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB | |
2550 { | |
2551 int chrAlpha= vChrFilter[2*dstY+1]; | |
2552 | |
2553 RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), | |
2554 dest, dstW, chrAlpha, dstbpp); | |
2555 } | |
2556 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB | |
2557 { | |
2558 int lumAlpha= vLumFilter[2*dstY+1]; | |
2559 int chrAlpha= vChrFilter[2*dstY+1]; | |
2560 | |
2561 RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), | |
2562 dest, dstW, lumAlpha, chrAlpha, dstbpp); | |
2563 } | |
2564 else //General RGB | |
2565 { | |
2566 RENAME(yuv2rgbX)( | |
2567 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
2568 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2569 dest, dstW, dstbpp, | |
2570 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4); | |
2571 } | |
2572 } | |
3352 | 2573 } |
2574 else // hmm looks like we cant use MMX here without overwriting this arrays tail | |
2575 { | |
2576 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
2577 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
2578 if(dstbpp==12) //YV12 | |
2579 { | |
2580 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
2581 yuv2yuvXinC( | |
2582 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
2583 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2584 dest, uDest, vDest, dstW); | |
2585 } | |
2586 else | |
2587 { | |
2588 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
2589 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
2590 yuv2rgbXinC( | |
2591 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
2592 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2593 dest, dstW, dstbpp); | |
2594 } | |
2595 } | |
3344 | 2596 } |
2534
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2597 |
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2598 #ifdef HAVE_MMX |
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2599 __asm __volatile(SFENCE:::"memory"); |
2566 | 2600 __asm __volatile(EMMS:::"memory"); |
2534
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2601 #endif |
3344 | 2602 firstTime=0; |
3641 | 2603 } |