Mercurial > mplayer.hg
annotate postproc/swscale_template.c @ 3345:20065c9b0f09
dithering info wasn't displayed
author | michael |
---|---|
date | Thu, 06 Dec 2001 01:23:23 +0000 |
parents | e87c59969d17 |
children | 64121e8a43f5 |
rev | line source |
---|---|
// Software scaling and colorspace conversion routines for MPlayer

// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
2264
7851375ea156
increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents:
2237
diff
changeset
|
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
// the parts written by michael are under GNU GPL
2264
7851375ea156
increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents:
2237
diff
changeset
|
7 |
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

/* MMX state clear: femms on 3DNow! capable CPUs.
   On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#ifdef HAVE_3DNOW
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Prefetch instruction selection: 3DNow! provides prefetch/prefetchw,
   MMX2 (SSE) provides prefetchnta/prefetcht0; otherwise "/nop" is
   emitted as an inert placeholder in the asm text. */
#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

/* Store fence: only available with MMX2 (SSE); "/nop" placeholder otherwise. */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
38 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
/* PAVGB(a,b): byte-wise average instruction — pavgb with MMX2,
   pavgusb with 3DNow!.  Left undefined when neither is available. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* MOVNTQ(a,b): quadword store — non-temporal (cache-bypassing) movntq
   with MMX2, plain movq otherwise.  Arguments are stringized into the
   inline-asm text. */
#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
50 |
3344 | 51 |
/*
 * YSCALEYUV2YV12X(x): inline-asm text for a multi-tap vertical filter
 * into a YV12 plane.  Operand use, as visible in the code:
 *   %0 = loop counter start for %%edx (counted up by 1 until zero —
 *        presumably the negated filter size; TODO confirm at call site)
 *   %1 = array of source line pointers (indexed by %%edx*4)
 *   %2 = array of filter coefficients (8 bytes per tap, via pmulhw)
 *   %3 = destination pointer, %4 = end bound compared against %%eax
 *   x  = constant byte offset added to each source address (stringized)
 * Each tap is multiplied and accumulated in mm3/mm4, scaled (psraw $3),
 * packed to unsigned bytes and stored through MOVNTQ.
 */
#define YSCALEYUV2YV12X(x) \
		"xorl %%eax, %%eax		\n\t"\
		"pxor %%mm3, %%mm3		\n\t"\
		"pxor %%mm4, %%mm4		\n\t"\
		"movl %0, %%edx			\n\t"\
		".balign 16			\n\t" /* FIXME Unroll? */\
		"1:				\n\t"\
		"movl (%1, %%edx, 4), %%esi	\n\t"\
		"movq (%2, %%edx, 8), %%mm0	\n\t" /* filterCoeff */\
		"movq " #x "(%%esi, %%eax, 2), %%mm2	\n\t" /* srcData */\
		"movq 8+" #x "(%%esi, %%eax, 2), %%mm5	\n\t" /* srcData */\
		"pmulhw %%mm0, %%mm2		\n\t"\
		"pmulhw %%mm0, %%mm5		\n\t"\
		"paddw %%mm2, %%mm3		\n\t"\
		"paddw %%mm5, %%mm4		\n\t"\
		"addl $1, %%edx			\n\t"\
		" jnz 1b			\n\t"\
		"psraw $3, %%mm3		\n\t"\
		"psraw $3, %%mm4		\n\t"\
		"packuswb %%mm4, %%mm3		\n\t"\
		MOVNTQ(%%mm3, (%3, %%eax))\
		"addl $8, %%eax			\n\t"\
		"cmpl %4, %%eax			\n\t"\
		"pxor %%mm3, %%mm3		\n\t"\
		"pxor %%mm4, %%mm4		\n\t"\
		"movl %0, %%edx			\n\t"\
		"jb 1b				\n\t"
79 | |
/*
 * YSCALEYUV2YV121: 1:1 vertical scale of one source line into a YV12
 * plane.  16-bit samples from %0 are shifted down (psraw $7), packed
 * to unsigned bytes and stored to %1 through MOVNTQ.  %2 is the start
 * index for %%eax; the loop repeats while the "addl $8" does not carry
 * (jnc), so %2 is presumably a negative offset — TODO confirm at call
 * site.
 */
#define YSCALEYUV2YV121 \
		"movl %2, %%eax			\n\t"\
		".balign 16			\n\t" /* FIXME Unroll? */\
		"1:				\n\t"\
		"movq (%0, %%eax, 2), %%mm0	\n\t"\
		"movq 8(%0, %%eax, 2), %%mm1	\n\t"\
		"psraw $7, %%mm0		\n\t"\
		"psraw $7, %%mm1		\n\t"\
		"packuswb %%mm1, %%mm0		\n\t"\
		MOVNTQ(%%mm0, (%1, %%eax))\
		"addl $8, %%eax			\n\t"\
		"jnc 1b				\n\t"
92 | |
/*
		:: "m" (-lumFilterSize), "m" (-chrFilterSize),
		   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
		   "r" (dest), "m" (dstW),
		   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
		: "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/*
 * YSCALEYUV2RGBX: multi-tap vertical filter (chroma first, then luma)
 * followed by YUV->RGB conversion.  Operand use per the inline
 * comments: %0/%1 = -lumFilterSize/-chrFilterSize loop counters,
 * %2/%3 = luma/chroma MMX coefficient arrays, %6/%7 = luma/chroma
 * source-line pointer arrays; w80/w400 and the *Coeff names are
 * memory constants defined elsewhere in the file.  On exit (per the
 * register-map comments) mm2/mm0 hold B, mm4/mm3 G, mm5/mm6 R, packed
 * pairwise to bytes.
 */
#define YSCALEYUV2RGBX \
		"xorl %%eax, %%eax		\n\t"\
		".balign 16			\n\t"\
		"1:				\n\t"\
		"movl %1, %%edx			\n\t" /* -chrFilterSize */\
		"movl %3, %%ebx			\n\t" /* chrMmxFilter+lumFilterSize */\
		"movl %7, %%ecx			\n\t" /* chrSrc+lumFilterSize */\
		"pxor %%mm3, %%mm3		\n\t"\
		"pxor %%mm4, %%mm4		\n\t"\
		"2:				\n\t"\
		"movl (%%ecx, %%edx, 4), %%esi	\n\t"\
		"movq (%%ebx, %%edx, 8), %%mm0	\n\t" /* filterCoeff */\
		"movq (%%esi, %%eax), %%mm2	\n\t" /* UsrcData */\
		"movq 4096(%%esi, %%eax), %%mm5	\n\t" /* VsrcData */\
		"pmulhw %%mm0, %%mm2		\n\t"\
		"pmulhw %%mm0, %%mm5		\n\t"\
		"paddw %%mm2, %%mm3		\n\t"\
		"paddw %%mm5, %%mm4		\n\t"\
		"addl $1, %%edx			\n\t"\
		" jnz 2b			\n\t"\
\
		"movl %0, %%edx			\n\t" /* -lumFilterSize */\
		"movl %2, %%ebx			\n\t" /* lumMmxFilter+lumFilterSize */\
		"movl %6, %%ecx			\n\t" /* lumSrc+lumFilterSize */\
		"pxor %%mm1, %%mm1		\n\t"\
		"pxor %%mm7, %%mm7		\n\t"\
		"2:				\n\t"\
		"movl (%%ecx, %%edx, 4), %%esi	\n\t"\
		"movq (%%ebx, %%edx, 8), %%mm0	\n\t" /* filterCoeff */\
		"movq (%%esi, %%eax, 2), %%mm2	\n\t" /* Y1srcData */\
		"movq 8(%%esi, %%eax, 2), %%mm5	\n\t" /* Y2srcData */\
		"pmulhw %%mm0, %%mm2		\n\t"\
		"pmulhw %%mm0, %%mm5		\n\t"\
		"paddw %%mm2, %%mm1		\n\t"\
		"paddw %%mm5, %%mm7		\n\t"\
		"addl $1, %%edx			\n\t"\
		" jnz 2b			\n\t"\
\
		"psubw w400, %%mm3		\n\t" /* (U-128)8*/\
		"psubw w400, %%mm4		\n\t" /* (V-128)8*/\
		"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
		"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
		"pmulhw ugCoeff, %%mm3		\n\t"\
		"pmulhw vgCoeff, %%mm4		\n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
		"pmulhw ubCoeff, %%mm2		\n\t"\
		"pmulhw vrCoeff, %%mm5		\n\t"\
		"psubw w80, %%mm1		\n\t" /* 8(Y-16)*/\
		"psubw w80, %%mm7		\n\t" /* 8(Y-16)*/\
		"pmulhw yCoeff, %%mm1		\n\t"\
		"pmulhw yCoeff, %%mm7		\n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
		"paddw %%mm3, %%mm4		\n\t"\
		"movq %%mm2, %%mm0		\n\t"\
		"movq %%mm5, %%mm6		\n\t"\
		"movq %%mm4, %%mm3		\n\t"\
		"punpcklwd %%mm2, %%mm2		\n\t"\
		"punpcklwd %%mm5, %%mm5		\n\t"\
		"punpcklwd %%mm4, %%mm4		\n\t"\
		"paddw %%mm1, %%mm2		\n\t"\
		"paddw %%mm1, %%mm5		\n\t"\
		"paddw %%mm1, %%mm4		\n\t"\
		"punpckhwd %%mm0, %%mm0		\n\t"\
		"punpckhwd %%mm6, %%mm6		\n\t"\
		"punpckhwd %%mm3, %%mm3		\n\t"\
		"paddw %%mm7, %%mm0		\n\t"\
		"paddw %%mm7, %%mm6		\n\t"\
		"paddw %%mm7, %%mm3		\n\t"\
	/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
		"packuswb %%mm0, %%mm2		\n\t"\
		"packuswb %%mm6, %%mm5		\n\t"\
		"packuswb %%mm3, %%mm4		\n\t"\
		"pxor %%mm7, %%mm7		\n\t"
173 | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
/*
 * FULL_YSCALEYUV2RGB: bilinear vertical interpolation between two luma
 * lines (%0/%1) and two chroma lines (%2/%3) with 16.16 weights %6
 * (yalpha1) and %7 (uvalpha1), followed by YUV->RGB conversion using
 * the yCoeff/ubCoeff/ugCoeff/vgCoeff/vrCoeff and w80/w400 memory
 * constants defined elsewhere in the file.  Per the inline comments,
 * the packed results end up as mm3 = B, mm0 = R, mm1 = G.
 */
#define FULL_YSCALEYUV2RGB \
		"pxor %%mm7, %%mm7		\n\t"\
		"movd %6, %%mm6			\n\t" /*yalpha1*/\
		"punpcklwd %%mm6, %%mm6		\n\t"\
		"punpcklwd %%mm6, %%mm6		\n\t"\
		"movd %7, %%mm5			\n\t" /*uvalpha1*/\
		"punpcklwd %%mm5, %%mm5		\n\t"\
		"punpcklwd %%mm5, %%mm5		\n\t"\
		"xorl %%eax, %%eax		\n\t"\
		".balign 16			\n\t"\
		"1:				\n\t"\
		"movq (%0, %%eax, 2), %%mm0	\n\t" /*buf0[eax]*/\
		"movq (%1, %%eax, 2), %%mm1	\n\t" /*buf1[eax]*/\
		"movq (%2, %%eax,2), %%mm2	\n\t" /* uvbuf0[eax]*/\
		"movq (%3, %%eax,2), %%mm3	\n\t" /* uvbuf1[eax]*/\
		"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
		"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
		"pmulhw %%mm6, %%mm0		\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
		"pmulhw %%mm5, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
		"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
		"movq 4096(%2, %%eax,2), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
		"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
		"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
		"movq 4096(%3, %%eax,2), %%mm0	\n\t" /* uvbuf1[eax+2048]*/\
		"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
		"psubw %%mm0, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
		"psubw w80, %%mm1		\n\t" /* 8(Y-16)*/\
		"psubw w400, %%mm3		\n\t" /* 8(U-128)*/\
		"pmulhw yCoeff, %%mm1		\n\t"\
\
\
		"pmulhw %%mm5, %%mm4		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
		"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
		"pmulhw ubCoeff, %%mm3		\n\t"\
		"psraw $4, %%mm0		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
		"pmulhw ugCoeff, %%mm2		\n\t"\
		"paddw %%mm4, %%mm0		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
		"psubw w400, %%mm0		\n\t" /* (V-128)8*/\
\
\
		"movq %%mm0, %%mm4		\n\t" /* (V-128)8*/\
		"pmulhw vrCoeff, %%mm0		\n\t"\
		"pmulhw vgCoeff, %%mm4		\n\t"\
		"paddw %%mm1, %%mm3		\n\t" /* B*/\
		"paddw %%mm1, %%mm0		\n\t" /* R*/\
		"packuswb %%mm3, %%mm3		\n\t"\
\
		"packuswb %%mm0, %%mm0		\n\t"\
		"paddw %%mm4, %%mm2		\n\t"\
		"paddw %%mm2, %%mm1		\n\t" /* G*/\
\
		"packuswb %%mm1, %%mm1		\n\t"
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
226 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
/*
 * YSCALEYUV2RGB: vertical bilinear interpolation (weights %6/%7, also
 * spilled to the asm_yalpha1/asm_uvalpha1 memory slots so the MMX
 * registers stay free inside the loop) followed by YUV->RGB
 * conversion.  Processes 8 luma pixels per iteration (Y1 in mm1, Y2
 * in mm7).  Per the inline register-map comments the output is
 * mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2, packed pairwise to
 * bytes at the end.
 */
#define YSCALEYUV2RGB \
		"movd %6, %%mm6			\n\t" /*yalpha1*/\
		"punpcklwd %%mm6, %%mm6		\n\t"\
		"punpcklwd %%mm6, %%mm6		\n\t"\
		"movq %%mm6, asm_yalpha1	\n\t"\
		"movd %7, %%mm5			\n\t" /*uvalpha1*/\
		"punpcklwd %%mm5, %%mm5		\n\t"\
		"punpcklwd %%mm5, %%mm5		\n\t"\
		"movq %%mm5, asm_uvalpha1	\n\t"\
		"xorl %%eax, %%eax		\n\t"\
		".balign 16			\n\t"\
		"1:				\n\t"\
		"movq (%2, %%eax), %%mm2	\n\t" /* uvbuf0[eax]*/\
		"movq (%3, %%eax), %%mm3	\n\t" /* uvbuf1[eax]*/\
		"movq 4096(%2, %%eax), %%mm5	\n\t" /* uvbuf0[eax+2048]*/\
		"movq 4096(%3, %%eax), %%mm4	\n\t" /* uvbuf1[eax+2048]*/\
		"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
		"psubw %%mm4, %%mm5		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
		"movq asm_uvalpha1, %%mm0	\n\t"\
		"pmulhw %%mm0, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
		"pmulhw %%mm0, %%mm5		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
		"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
		"psraw $4, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
		"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
		"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
		"psubw w400, %%mm3		\n\t" /* (U-128)8*/\
		"psubw w400, %%mm4		\n\t" /* (V-128)8*/\
		"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
		"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
		"pmulhw ugCoeff, %%mm3		\n\t"\
		"pmulhw vgCoeff, %%mm4		\n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
		"movq (%0, %%eax, 2), %%mm0	\n\t" /*buf0[eax]*/\
		"movq (%1, %%eax, 2), %%mm1	\n\t" /*buf1[eax]*/\
		"movq 8(%0, %%eax, 2), %%mm6	\n\t" /*buf0[eax]*/\
		"movq 8(%1, %%eax, 2), %%mm7	\n\t" /*buf1[eax]*/\
		"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
		"psubw %%mm7, %%mm6		\n\t" /* buf0[eax] - buf1[eax]*/\
		"pmulhw asm_yalpha1, %%mm0	\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
		"pmulhw asm_yalpha1, %%mm6	\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
		"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
		"psraw $4, %%mm7		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
		"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
		"paddw %%mm6, %%mm7		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
		"pmulhw ubCoeff, %%mm2		\n\t"\
		"pmulhw vrCoeff, %%mm5		\n\t"\
		"psubw w80, %%mm1		\n\t" /* 8(Y-16)*/\
		"psubw w80, %%mm7		\n\t" /* 8(Y-16)*/\
		"pmulhw yCoeff, %%mm1		\n\t"\
		"pmulhw yCoeff, %%mm7		\n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
		"paddw %%mm3, %%mm4		\n\t"\
		"movq %%mm2, %%mm0		\n\t"\
		"movq %%mm5, %%mm6		\n\t"\
		"movq %%mm4, %%mm3		\n\t"\
		"punpcklwd %%mm2, %%mm2		\n\t"\
		"punpcklwd %%mm5, %%mm5		\n\t"\
		"punpcklwd %%mm4, %%mm4		\n\t"\
		"paddw %%mm1, %%mm2		\n\t"\
		"paddw %%mm1, %%mm5		\n\t"\
		"paddw %%mm1, %%mm4		\n\t"\
		"punpckhwd %%mm0, %%mm0		\n\t"\
		"punpckhwd %%mm6, %%mm6		\n\t"\
		"punpckhwd %%mm3, %%mm3		\n\t"\
		"paddw %%mm7, %%mm0		\n\t"\
		"paddw %%mm7, %%mm6		\n\t"\
		"paddw %%mm7, %%mm3		\n\t"\
	/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
		"packuswb %%mm0, %%mm2		\n\t"\
		"packuswb %%mm6, %%mm5		\n\t"\
		"packuswb %%mm3, %%mm4		\n\t"\
		"pxor %%mm7, %%mm7		\n\t"
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
299 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
300 #define YSCALEYUV2RGB1 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
301 "xorl %%eax, %%eax \n\t"\ |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
302 ".balign 16 \n\t"\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
303 "1: \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
304 "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
305 "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
306 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
307 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
308 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
309 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
310 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
311 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
312 "pmulhw ugCoeff, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
313 "pmulhw vgCoeff, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
314 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
315 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
316 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
317 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
318 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
319 "pmulhw ubCoeff, %%mm2 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
320 "pmulhw vrCoeff, %%mm5 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
321 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
322 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
323 "pmulhw yCoeff, %%mm1 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
324 "pmulhw yCoeff, %%mm7 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
325 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
326 "paddw %%mm3, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
327 "movq %%mm2, %%mm0 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
328 "movq %%mm5, %%mm6 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
329 "movq %%mm4, %%mm3 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
330 "punpcklwd %%mm2, %%mm2 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
331 "punpcklwd %%mm5, %%mm5 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
332 "punpcklwd %%mm4, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
333 "paddw %%mm1, %%mm2 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
334 "paddw %%mm1, %%mm5 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
335 "paddw %%mm1, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
336 "punpckhwd %%mm0, %%mm0 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
337 "punpckhwd %%mm6, %%mm6 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
338 "punpckhwd %%mm3, %%mm3 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
339 "paddw %%mm7, %%mm0 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
340 "paddw %%mm7, %%mm6 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
341 "paddw %%mm7, %%mm3 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
342 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
343 "packuswb %%mm0, %%mm2 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
344 "packuswb %%mm6, %%mm5 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
345 "packuswb %%mm3, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
346 "pxor %%mm7, %%mm7 \n\t" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
347 |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
348 // do vertical chrominance interpolation |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
349 #define YSCALEYUV2RGB1b \ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
350 "xorl %%eax, %%eax \n\t"\ |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
351 ".balign 16 \n\t"\ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
352 "1: \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
353 "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
354 "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
355 "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
356 "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ |
2576 | 357 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ |
358 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ | |
3344 | 359 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ |
360 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
361 "psubw w400, %%mm3 \n\t" /* (U-128)8*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
362 "psubw w400, %%mm4 \n\t" /* (V-128)8*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
363 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
364 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
365 "pmulhw ugCoeff, %%mm3 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
366 "pmulhw vgCoeff, %%mm4 \n\t"\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
367 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
368 "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
369 "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
370 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
371 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
372 "pmulhw ubCoeff, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
373 "pmulhw vrCoeff, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
374 "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
375 "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
376 "pmulhw yCoeff, %%mm1 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
377 "pmulhw yCoeff, %%mm7 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
378 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
379 "paddw %%mm3, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
380 "movq %%mm2, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
381 "movq %%mm5, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
382 "movq %%mm4, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
383 "punpcklwd %%mm2, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
384 "punpcklwd %%mm5, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
385 "punpcklwd %%mm4, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
386 "paddw %%mm1, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
387 "paddw %%mm1, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
388 "paddw %%mm1, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
389 "punpckhwd %%mm0, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
390 "punpckhwd %%mm6, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
391 "punpckhwd %%mm3, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
392 "paddw %%mm7, %%mm0 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
393 "paddw %%mm7, %%mm6 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
394 "paddw %%mm7, %%mm3 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
395 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
396 "packuswb %%mm0, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
397 "packuswb %%mm6, %%mm5 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
398 "packuswb %%mm3, %%mm4 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
399 "pxor %%mm7, %%mm7 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
400 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
401 #define WRITEBGR32 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
402 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
403 "movq %%mm2, %%mm1 \n\t" /* B */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
404 "movq %%mm5, %%mm6 \n\t" /* R */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
405 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
406 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
407 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
408 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
409 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
410 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
411 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
412 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
413 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
414 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
415 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
416 MOVNTQ(%%mm0, (%4, %%eax, 4))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
417 MOVNTQ(%%mm2, 8(%4, %%eax, 4))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
418 MOVNTQ(%%mm1, 16(%4, %%eax, 4))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
419 MOVNTQ(%%mm3, 24(%4, %%eax, 4))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
420 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
421 "addl $8, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
422 "cmpl %5, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
423 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
424 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
425 #define WRITEBGR16 \ |
2669 | 426 "pand bF8, %%mm2 \n\t" /* B */\ |
427 "pand bFC, %%mm4 \n\t" /* G */\ | |
428 "pand bF8, %%mm5 \n\t" /* R */\ | |
429 "psrlq $3, %%mm2 \n\t"\ | |
430 \ | |
431 "movq %%mm2, %%mm1 \n\t"\ | |
432 "movq %%mm4, %%mm3 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
433 \ |
2669 | 434 "punpcklbw %%mm7, %%mm3 \n\t"\ |
435 "punpcklbw %%mm5, %%mm2 \n\t"\ | |
436 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
437 "punpckhbw %%mm5, %%mm1 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
438 \ |
2669 | 439 "psllq $3, %%mm3 \n\t"\ |
440 "psllq $3, %%mm4 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
441 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
442 "por %%mm3, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
443 "por %%mm4, %%mm1 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
444 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
445 MOVNTQ(%%mm2, (%4, %%eax, 2))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
446 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
447 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
448 "addl $8, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
449 "cmpl %5, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
450 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
451 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
452 #define WRITEBGR15 \ |
2669 | 453 "pand bF8, %%mm2 \n\t" /* B */\ |
454 "pand bF8, %%mm4 \n\t" /* G */\ | |
455 "pand bF8, %%mm5 \n\t" /* R */\ | |
456 "psrlq $3, %%mm2 \n\t"\ | |
457 "psrlq $1, %%mm5 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
458 \ |
2669 | 459 "movq %%mm2, %%mm1 \n\t"\ |
460 "movq %%mm4, %%mm3 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
461 \ |
2669 | 462 "punpcklbw %%mm7, %%mm3 \n\t"\ |
463 "punpcklbw %%mm5, %%mm2 \n\t"\ | |
464 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
465 "punpckhbw %%mm5, %%mm1 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
466 \ |
2669 | 467 "psllq $2, %%mm3 \n\t"\ |
468 "psllq $2, %%mm4 \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
469 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
470 "por %%mm3, %%mm2 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
471 "por %%mm4, %%mm1 \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
472 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
473 MOVNTQ(%%mm2, (%4, %%eax, 2))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
474 MOVNTQ(%%mm1, 8(%4, %%eax, 2))\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
475 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
476 "addl $8, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
477 "cmpl %5, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
478 " jb 1b \n\t" |
2669 | 479 |
2730 | 480 #define WRITEBGR24OLD \ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
481 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
482 "movq %%mm2, %%mm1 \n\t" /* B */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
483 "movq %%mm5, %%mm6 \n\t" /* R */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
484 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
485 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
486 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
487 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
488 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
489 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ |
2326 | 490 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ |
491 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | |
492 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | |
493 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
494 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
495 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
496 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
497 "pand bm00000111, %%mm4 \n\t" /* 00000RGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
498 "pand bm11111000, %%mm0 \n\t" /* 00RGB000 0.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
499 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
500 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
501 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
502 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
503 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
504 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
505 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
506 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
507 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
508 "pand bm00001111, %%mm2 \n\t" /* 0000RGBR 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
509 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
510 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
511 "pand bm00000111, %%mm4 \n\t" /* 00000RGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
512 "pand bm11111000, %%mm1 \n\t" /* 00RGB000 2.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
513 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
514 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
515 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
516 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
517 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
518 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
519 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
520 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
521 "pand bm00000111, %%mm5 \n\t" /* 00000RGB 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
522 "pand bm11111000, %%mm3 \n\t" /* 00RGB000 3.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
523 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
524 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
525 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
526 \ |
2728 | 527 MOVNTQ(%%mm0, (%%ebx))\ |
528 MOVNTQ(%%mm2, 8(%%ebx))\ | |
529 MOVNTQ(%%mm3, 16(%%ebx))\ | |
530 "addl $24, %%ebx \n\t"\ | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
531 \ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
532 "addl $8, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
533 "cmpl %5, %%eax \n\t"\ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
534 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
535 |
/*
 * Pack the 8 B/G/R byte lanes held in mm2/mm4/mm5 (mm7 = 0) into 24 bytes
 * of BGR24 at (%%ebx) using only plain MMX (no pshufw), then advance the
 * destination by 24 and the source index in %%eax by 8, looping to label 1
 * while %%eax < %5.
 * NOTE(review): clobbers mm0-mm7; assumes %%ebx = dest, %%eax = index and
 * that label "1:" and operand %5 (loop bound) are provided by the enclosing
 * asm block — confirm against each use site.
 */
#define WRITEBGR24MMX \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
		"movq %%mm2, %%mm1		\n\t" /* B */\
		"movq %%mm5, %%mm6		\n\t" /* R */\
		"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
		"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
		"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
		"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
		"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
		"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
		"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
		"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
		"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
		"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
		"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
		"movq %%mm2, %%mm6		\n\t" /* 0RGB0RGB 1 */\
		"movq %%mm1, %%mm5		\n\t" /* 0RGB0RGB 2 */\
		"movq %%mm3, %%mm7		\n\t" /* 0RGB0RGB 3 */\
\
		"psllq $40, %%mm0		\n\t" /* RGB00000 0 */\
		"psllq $40, %%mm2		\n\t" /* RGB00000 1 */\
		"psllq $40, %%mm1		\n\t" /* RGB00000 2 */\
		"psllq $40, %%mm3		\n\t" /* RGB00000 3 */\
\
		"punpckhdq %%mm4, %%mm0		\n\t" /* 0RGBRGB0 0 */\
		"punpckhdq %%mm6, %%mm2		\n\t" /* 0RGBRGB0 1 */\
		"punpckhdq %%mm5, %%mm1		\n\t" /* 0RGBRGB0 2 */\
		"punpckhdq %%mm7, %%mm3		\n\t" /* 0RGBRGB0 3 */\
\
		/* shift/or the three 8-byte groups so the 4x 0RGBRGB0 qwords */\
		/* become a contiguous 24-byte run */\
		"psrlq $8, %%mm0		\n\t" /* 00RGBRGB 0 */\
		"movq %%mm2, %%mm6		\n\t" /* 0RGBRGB0 1 */\
		"psllq $40, %%mm2		\n\t" /* GB000000 1 */\
		"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
		MOVNTQ(%%mm0, (%%ebx))\
\
		"psrlq $24, %%mm6		\n\t" /* 0000RGBR 1 */\
		"movq %%mm1, %%mm5		\n\t" /* 0RGBRGB0 2 */\
		"psllq $24, %%mm1		\n\t" /* BRGB0000 2 */\
		"por %%mm1, %%mm6		\n\t" /* BRGBRGBR 1 */\
		MOVNTQ(%%mm6, 8(%%ebx))\
\
		"psrlq $40, %%mm5		\n\t" /* 000000RG 2 */\
		"psllq $8, %%mm3		\n\t" /* RGBRGB00 3 */\
		"por %%mm3, %%mm5		\n\t" /* RGBRGBRG 2 */\
		MOVNTQ(%%mm5, 16(%%ebx))\
\
		"addl $24, %%ebx		\n\t"\
\
		"addl $8, %%eax			\n\t"\
		"cmpl %5, %%eax			\n\t"\
		" jb 1b				\n\t"
588 | |
/*
 * MMX2 (pshufw) variant of WRITEBGR24MMX: builds each 8-byte output group
 * directly by shuffling B/G/R lanes into place and masking with the
 * M24A/M24B/M24C constants (declared elsewhere in this file), instead of
 * the longer punpck/shift sequence. Same contract as WRITEBGR24MMX:
 * mm2=B, mm4=G, mm5=R, mm7=0 on entry; writes 24 bytes at (%%ebx),
 * advances %%ebx by 24 and %%eax by 8, loops to "1:" while %%eax < %5.
 * NOTE(review): clobbers mm0-mm7 including the inputs — confirm callers
 * do not reuse them afterwards.
 */
#define WRITEBGR24MMX2 \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
		"movq M24A, %%mm0		\n\t"\
		"movq M24C, %%mm7		\n\t"\
		"pshufw $0x50, %%mm2, %%mm1	\n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
		"pshufw $0x50, %%mm4, %%mm3	\n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
		"pshufw $0x00, %%mm5, %%mm6	\n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
		"pand %%mm0, %%mm1		\n\t" /*    B2        B1       B0 */\
		"pand %%mm0, %%mm3		\n\t" /*    G2        G1       G0 */\
		"pand %%mm7, %%mm6		\n\t" /*       R1        R0       */\
\
		"psllq $8, %%mm3		\n\t" /* G2        G1       G0    */\
		"por %%mm1, %%mm6		\n\t"\
		"por %%mm3, %%mm6		\n\t"\
		MOVNTQ(%%mm6, (%%ebx))\
\
		"psrlq $8, %%mm4		\n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
		"pshufw $0xA5, %%mm2, %%mm1	\n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
		"pshufw $0x55, %%mm4, %%mm3	\n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
		"pshufw $0xA5, %%mm5, %%mm6	\n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
		"pand M24B, %%mm1		\n\t" /* B5       B4        B3    */\
		"pand %%mm7, %%mm3		\n\t" /*       G4        G3       */\
		"pand %%mm0, %%mm6		\n\t" /*    R4        R3       R2 */\
\
		"por %%mm1, %%mm3		\n\t" /* B5 G4 B4     G3 B3       */\
		"por %%mm3, %%mm6		\n\t"\
		MOVNTQ(%%mm6, 8(%%ebx))\
\
		"pshufw $0xFF, %%mm2, %%mm1	\n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
		"pshufw $0xFA, %%mm4, %%mm3	\n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
		"pshufw $0xFA, %%mm5, %%mm6	\n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
		"pand %%mm7, %%mm1		\n\t" /*       B7        B6       */\
		"pand %%mm0, %%mm3		\n\t" /*    G7        G6       G5 */\
		"pand M24B, %%mm6		\n\t" /* R7       R6        R5    */\
\
		"por %%mm1, %%mm3		\n\t"\
		"por %%mm3, %%mm6		\n\t"\
		MOVNTQ(%%mm6, 16(%%ebx))\
\
		"addl $24, %%ebx		\n\t"\
\
		"addl $8, %%eax			\n\t"\
		"cmpl %5, %%eax			\n\t"\
		" jb 1b				\n\t"
636 | |
/* Select the BGR24 writer for this compilation pass: the pshufw-based
 * variant when MMX2 is available, the plain-MMX one otherwise.
 * The #undef allows this template to be included multiple times. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX2
#else
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX
#endif
644 | |
3344 | 645 static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, |
646 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | |
647 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, | |
648 int16_t * lumMmxFilter, int16_t * chrMmxFilter) | |
2519 | 649 { |
3344 | 650 #ifdef HAVE_MMX |
651 if(uDest != NULL) | |
652 { | |
653 asm volatile( | |
654 YSCALEYUV2YV12X(0) | |
655 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize), | |
656 "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1) | |
657 : "%eax", "%edx", "%esi" | |
658 ); | |
2519 | 659 |
3344 | 660 asm volatile( |
661 YSCALEYUV2YV12X(4096) | |
662 :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize), | |
663 "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1) | |
664 : "%eax", "%edx", "%esi" | |
665 ); | |
666 } | |
2521 | 667 |
3344 | 668 asm volatile( |
669 YSCALEYUV2YV12X(0) | |
670 :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize), | |
671 "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW) | |
672 : "%eax", "%edx", "%esi" | |
673 ); | |
674 #else | |
675 //FIXME Optimize (just quickly writen not opti..) | |
676 int i; | |
677 for(i=0; i<dstW; i++) | |
2519 | 678 { |
3344 | 679 int val=0; |
680 int j; | |
681 for(j=0; j<lumFilterSize; j++) | |
682 val += lumSrc[j][i] * lumFilter[j]; | |
683 | |
684 dest[i]= MIN(MAX(val>>19, 0), 255); | |
2519 | 685 } |
686 | |
3344 | 687 if(uDest != NULL) |
3209 | 688 for(i=0; i<(dstW>>1); i++) |
2519 | 689 { |
3344 | 690 int u=0; |
691 int v=0; | |
692 int j; | |
693 for(j=0; j<lumFilterSize; j++) | |
694 { | |
695 u += chrSrc[j][i] * chrFilter[j]; | |
696 v += chrSrc[j][i + 2048] * chrFilter[j]; | |
697 } | |
698 | |
699 uDest[i]= MIN(MAX(u>>19, 0), 255); | |
700 vDest[i]= MIN(MAX(v>>19, 0), 255); | |
2519 | 701 } |
3344 | 702 #endif |
703 } | |
704 | |
705 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc, | |
706 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW) | |
707 { | |
708 #ifdef HAVE_MMX | |
709 if(uDest != NULL) | |
710 { | |
711 asm volatile( | |
712 YSCALEYUV2YV121 | |
713 :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)), | |
714 "g" (-(dstW>>1)) | |
715 : "%eax" | |
716 ); | |
717 | |
718 asm volatile( | |
719 YSCALEYUV2YV121 | |
720 :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)), | |
721 "g" (-(dstW>>1)) | |
722 : "%eax" | |
723 ); | |
2519 | 724 } |
3344 | 725 |
726 asm volatile( | |
727 YSCALEYUV2YV121 | |
728 :: "r" (lumSrc + dstW), "r" (dest + dstW), | |
729 "g" (-dstW) | |
730 : "%eax" | |
731 ); | |
732 #else | |
733 //FIXME Optimize (just quickly writen not opti..) | |
734 //FIXME replace MINMAX with LUTs | |
735 int i; | |
736 for(i=0; i<dstW; i++) | |
737 { | |
738 int val= lumSrc[i]>>7; | |
739 | |
740 dest[i]= MIN(MAX(val>>19, 0), 255); | |
741 } | |
742 | |
743 if(uDest != NULL) | |
744 for(i=0; i<(dstW>>1); i++) | |
745 { | |
746 int u=chrSrc[i]>>7; | |
747 int v=chrSrc[i + 2048]>>7; | |
748 | |
749 uDest[i]= MIN(MAX(u>>19, 0), 255); | |
750 vDest[i]= MIN(MAX(v>>19, 0), 255); | |
751 } | |
752 #endif | |
2519 | 753 } |
754 | |
3344 | 755 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
756 /** |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
757 * vertical scale YV12 to RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
758 */ |
/**
 * Vertical scale/filter of planar YV12 directly to packed RGB/BGR.
 * Accumulates lumFilterSize weighted luma lines and chrFilterSize weighted
 * chroma lines per output pixel, converts to RGB via the clip_yuvtab_*
 * lookup tables (declared elsewhere in this file), and writes dstbpp
 * (32/24/16/15) packed pixels into dest. Processes two luma pixels per
 * chroma sample (dstW>>1 iterations). The fullUVIpol path is not
 * implemented yet (//FIXME below).
 * NOTE(review): the MMX paths hand %ebx to the WRITEBGR* macros as the
 * destination pointer; dstbpp==15/16 add the b5/g5-or-g6/r5 dither bias
 * vectors before packing when DITHER1XBPP is defined.
 */
static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
				    uint8_t *dest, int dstW, int dstbpp, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
	if(fullUVIpol)
	{
		//FIXME
	}//FULL_UV_IPOL
	else
	{
#ifdef HAVE_MMX
	if(dstbpp == 32) //FIXME untested
	{
		asm volatile(
			YSCALEYUV2RGBX
			WRITEBGR32

		:: "m" (-lumFilterSize), "m" (-chrFilterSize),
		   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
		   "r" (dest), "m" (dstW),
		   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
		: "%eax", "%ebx", "%ecx", "%edx", "%esi"
		);
	}
	else if(dstbpp==24) //FIXME untested
	{
		asm volatile(
			YSCALEYUV2RGBX
			/* BGR24 advances dest by 3 bytes/pixel: ebx = 3*eax + dest */
			"leal (%%eax, %%eax, 2), %%ebx	\n\t" //FIXME optimize
			"addl %4, %%ebx			\n\t"
			WRITEBGR24

		:: "m" (-lumFilterSize), "m" (-chrFilterSize),
		   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
		   "r" (dest), "m" (dstW),
		   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
		: "%eax", "%ebx", "%ecx", "%edx", "%esi"
		);
	}
	else if(dstbpp==15)
	{
		asm volatile(
			YSCALEYUV2RGBX
			/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
			/* 5-bit dither bias on all three channels for RGB555 */
			"paddusb b5Dither, %%mm2	\n\t"
			"paddusb g5Dither, %%mm4	\n\t"
			"paddusb r5Dither, %%mm5	\n\t"
#endif

			WRITEBGR15

		:: "m" (-lumFilterSize), "m" (-chrFilterSize),
		   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
		   "r" (dest), "m" (dstW),
		   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
		: "%eax", "%ebx", "%ecx", "%edx", "%esi"
		);
	}
	else if(dstbpp==16)
	{
		asm volatile(
			YSCALEYUV2RGBX
			/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
			/* RGB565: green gets the 6-bit dither table */
			"paddusb b5Dither, %%mm2	\n\t"
			"paddusb g6Dither, %%mm4	\n\t"
			"paddusb r5Dither, %%mm5	\n\t"
#endif

			WRITEBGR16

		:: "m" (-lumFilterSize), "m" (-chrFilterSize),
		   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
		   "r" (dest), "m" (dstW),
		   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
		: "%eax", "%ebx", "%ecx", "%edx", "%esi"
		);
	}
#else
	if(dstbpp==32)
	{
		int i;
		for(i=0; i<(dstW>>1); i++){	/* two luma pixels per chroma sample */
			int j;
			int Y1=0;
			int Y2=0;
			int U=0;
			int V=0;
			int Cb, Cr, Cg;
			for(j=0; j<lumFilterSize; j++)
			{
				Y1 += lumSrc[j][2*i] * lumFilter[j];
				Y2 += lumSrc[j][2*i+1] * lumFilter[j];
			}
			for(j=0; j<chrFilterSize; j++)
			{
				U += chrSrc[j][i] * chrFilter[j];
				V += chrSrc[j][i+2048] * chrFilter[j];
			}
			/* >>19 undoes the filter scale; +256 biases into table range */
			Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
			Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
			U >>= 19;
			V >>= 19;

			Cb= clip_yuvtab_40cf[U+ 256];
			Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
			Cr= clip_yuvtab_3343[V+ 256];

			/* byte order B,G,R,x per 32bit pixel; alpha bytes [3],[7]
			   are left untouched */
			dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
			dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
			dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];

			dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
			dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
			dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
		}
	}
	else if(dstbpp==24)
	{
		int i;
		for(i=0; i<(dstW>>1); i++){
			int j;
			int Y1=0;
			int Y2=0;
			int U=0;
			int V=0;
			int Cb, Cr, Cg;
			for(j=0; j<lumFilterSize; j++)
			{
				Y1 += lumSrc[j][2*i] * lumFilter[j];
				Y2 += lumSrc[j][2*i+1] * lumFilter[j];
			}
			for(j=0; j<chrFilterSize; j++)
			{
				U += chrSrc[j][i] * chrFilter[j];
				V += chrSrc[j][i+2048] * chrFilter[j];
			}
			Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
			Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
			U >>= 19;
			V >>= 19;

			Cb= clip_yuvtab_40cf[U+ 256];
			Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
			Cr= clip_yuvtab_3343[V+ 256];

			/* 3 bytes per pixel, dest advanced manually */
			dest[0]=clip_table[((Y1 + Cb) >>13)];
			dest[1]=clip_table[((Y1 + Cg) >>13)];
			dest[2]=clip_table[((Y1 + Cr) >>13)];

			dest[3]=clip_table[((Y2 + Cb) >>13)];
			dest[4]=clip_table[((Y2 + Cg) >>13)];
			dest[5]=clip_table[((Y2 + Cr) >>13)];
			dest+=6;
		}
	}
	else if(dstbpp==16)
	{
		int i;
		for(i=0; i<(dstW>>1); i++){
			int j;
			int Y1=0;
			int Y2=0;
			int U=0;
			int V=0;
			int Cb, Cr, Cg;
			for(j=0; j<lumFilterSize; j++)
			{
				Y1 += lumSrc[j][2*i] * lumFilter[j];
				Y2 += lumSrc[j][2*i+1] * lumFilter[j];
			}
			for(j=0; j<chrFilterSize; j++)
			{
				U += chrSrc[j][i] * chrFilter[j];
				V += chrSrc[j][i+2048] * chrFilter[j];
			}
			Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
			Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
			U >>= 19;
			V >>= 19;

			Cb= clip_yuvtab_40cf[U+ 256];
			Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
			Cr= clip_yuvtab_3343[V+ 256];

			/* clip_table16* tables pre-position the channel bits for 565 */
			((uint16_t*)dest)[2*i] =
				clip_table16b[(Y1 + Cb) >>13] |
				clip_table16g[(Y1 + Cg) >>13] |
				clip_table16r[(Y1 + Cr) >>13];

			((uint16_t*)dest)[2*i+1] =
				clip_table16b[(Y2 + Cb) >>13] |
				clip_table16g[(Y2 + Cg) >>13] |
				clip_table16r[(Y2 + Cr) >>13];
		}
	}
	else if(dstbpp==15)
	{
		int i;
		for(i=0; i<(dstW>>1); i++){
			int j;
			int Y1=0;
			int Y2=0;
			int U=0;
			int V=0;
			int Cb, Cr, Cg;
			for(j=0; j<lumFilterSize; j++)
			{
				Y1 += lumSrc[j][2*i] * lumFilter[j];
				Y2 += lumSrc[j][2*i+1] * lumFilter[j];
			}
			for(j=0; j<chrFilterSize; j++)
			{
				U += chrSrc[j][i] * chrFilter[j];
				V += chrSrc[j][i+2048] * chrFilter[j];
			}
			Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
			Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
			U >>= 19;
			V >>= 19;

			Cb= clip_yuvtab_40cf[U+ 256];
			Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
			Cr= clip_yuvtab_3343[V+ 256];

			/* clip_table15* tables pre-position the channel bits for 555 */
			((uint16_t*)dest)[2*i] =
				clip_table15b[(Y1 + Cb) >>13] |
				clip_table15g[(Y1 + Cg) >>13] |
				clip_table15r[(Y1 + Cr) >>13];

			((uint16_t*)dest)[2*i+1] =
				clip_table15b[(Y2 + Cb) >>13] |
				clip_table15g[(Y2 + Cg) >>13] |
				clip_table15r[(Y2 + Cr) >>13];
		}
	}
#endif
	} //!FULL_UV_IPOL
}
999 | |
1000 | |
1001 /** | |
1002 * vertical bilinear scale YV12 to RGB | |
1003 */ | |
1004 static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, | |
3209 | 1005 uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstbpp) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1006 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1007 int yalpha1=yalpha^4095; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1008 int uvalpha1=uvalpha^4095; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1009 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1010 if(fullUVIpol) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1011 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1012 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1013 #ifdef HAVE_MMX |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1014 if(dstbpp == 32) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1015 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1016 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1017 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1018 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1019 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1020 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1021 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1022 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1023 "movq %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1024 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1025 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1026 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1027 MOVNTQ(%%mm3, (%4, %%eax, 4)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1028 MOVNTQ(%%mm1, 8(%4, %%eax, 4)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1029 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1030 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1031 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1032 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1033 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1034 |
3209 | 1035 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1036 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1037 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1038 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1039 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1040 else if(dstbpp==24) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1041 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1042 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1043 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1044 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1045 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1046 // lsb ... msb |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1047 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1048 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1049 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1050 "movq %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1051 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1052 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1053 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1054 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1055 "psrlq $8, %%mm3 \n\t" // GR0BGR00 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1056 "pand bm00000111, %%mm2 \n\t" // BGR00000 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1057 "pand bm11111000, %%mm3 \n\t" // 000BGR00 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1058 "por %%mm2, %%mm3 \n\t" // BGRBGR00 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1059 "movq %%mm1, %%mm2 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1060 "psllq $48, %%mm1 \n\t" // 000000BG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1061 "por %%mm1, %%mm3 \n\t" // BGRBGRBG |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1062 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1063 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1064 "psrld $16, %%mm2 \n\t" // R000R000 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1065 "psrlq $24, %%mm1 \n\t" // 0BGR0000 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1066 "por %%mm2, %%mm1 \n\t" // RBGRR000 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1067 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1068 "movl %4, %%ebx \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1069 "addl %%eax, %%ebx \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1070 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1071 #ifdef HAVE_MMX2 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1072 //FIXME Alignment |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1073 "movntq %%mm3, (%%ebx, %%eax, 2)\n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1074 "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1075 #else |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1076 "movd %%mm3, (%%ebx, %%eax, 2) \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1077 "psrlq $32, %%mm3 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1078 "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1079 "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1080 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1081 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1082 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1083 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1084 |
3209 | 1085 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1086 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1087 : "%eax", "%ebx" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1088 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1089 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1090 else if(dstbpp==15) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1091 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1092 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1093 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1094 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1095 #ifdef DITHER1XBPP |
2748 | 1096 "paddusb g5Dither, %%mm1 \n\t" |
1097 "paddusb r5Dither, %%mm0 \n\t" | |
1098 "paddusb b5Dither, %%mm3 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1099 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1100 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1101 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1102 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1103 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1104 "psrlw $3, %%mm3 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1105 "psllw $2, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1106 "psllw $7, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1107 "pand g15Mask, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1108 "pand r15Mask, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1109 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1110 "por %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1111 "por %%mm1, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1112 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1113 MOVNTQ(%%mm0, (%4, %%eax, 2)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1114 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1115 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1116 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1117 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1118 |
3209 | 1119 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1120 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1121 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1122 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1123 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1124 else if(dstbpp==16) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1125 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1126 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1127 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1128 FULL_YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1129 #ifdef DITHER1XBPP |
2748 | 1130 "paddusb g6Dither, %%mm1 \n\t" |
1131 "paddusb r5Dither, %%mm0 \n\t" | |
1132 "paddusb b5Dither, %%mm3 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1133 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1134 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1135 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1136 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1137 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1138 "psrlw $3, %%mm3 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1139 "psllw $3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1140 "psllw $8, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1141 "pand g16Mask, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1142 "pand r16Mask, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1143 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1144 "por %%mm3, %%mm1 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1145 "por %%mm1, %%mm0 \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1146 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1147 MOVNTQ(%%mm0, (%4, %%eax, 2)) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1148 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1149 "addl $4, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1150 "cmpl %5, %%eax \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1151 " jb 1b \n\t" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1152 |
3209 | 1153 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1154 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1155 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1156 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1157 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1158 #else |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1159 if(dstbpp==32 || dstbpp==24) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1160 { |
2671 | 1161 int i; |
3209 | 1162 for(i=0;i<dstW;i++){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1163 // vertical linear interpolation && yuv2rgb in a single step: |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1164 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1165 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1166 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
2503 | 1167 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; |
1168 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | |
1169 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1170 dest+=dstbpp>>3; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1171 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1172 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1173 else if(dstbpp==16) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1174 { |
2671 | 1175 int i; |
3209 | 1176 for(i=0;i<dstW;i++){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1177 // vertical linear interpolation && yuv2rgb in a single step: |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1178 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1179 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1180 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1181 |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1182 ((uint16_t*)dest)[i] = |
2584 | 1183 clip_table16b[(Y + yuvtab_40cf[U]) >>13] | |
1184 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1185 clip_table16r[(Y + yuvtab_3343[V]) >>13]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1186 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1187 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1188 else if(dstbpp==15) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1189 { |
2671 | 1190 int i; |
3209 | 1191 for(i=0;i<dstW;i++){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1192 // vertical linear interpolation && yuv2rgb in a single step: |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1193 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1194 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1195 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1196 |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1197 ((uint16_t*)dest)[i] = |
2584 | 1198 clip_table15b[(Y + yuvtab_40cf[U]) >>13] | |
1199 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | |
1200 clip_table15r[(Y + yuvtab_3343[V]) >>13]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1201 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1202 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1203 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1204 }//FULL_UV_IPOL |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1205 else |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1206 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1207 #ifdef HAVE_MMX |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1208 if(dstbpp == 32) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1209 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1210 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1211 YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1212 WRITEBGR32 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1213 |
3209 | 1214 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1215 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1216 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1217 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1218 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1219 else if(dstbpp==24) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1220 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1221 asm volatile( |
2728 | 1222 "movl %4, %%ebx \n\t" |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1223 YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1224 WRITEBGR24 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1225 |
3209 | 1226 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1227 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1228 : "%eax", "%ebx" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1229 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1230 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1231 else if(dstbpp==15) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1232 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1233 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1234 YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1235 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1236 #ifdef DITHER1XBPP |
2748 | 1237 "paddusb b5Dither, %%mm2 \n\t" |
1238 "paddusb g5Dither, %%mm4 \n\t" | |
1239 "paddusb r5Dither, %%mm5 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1240 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1241 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1242 WRITEBGR15 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1243 |
3209 | 1244 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1245 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1246 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1247 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1248 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1249 else if(dstbpp==16) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1250 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1251 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1252 YSCALEYUV2RGB |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1253 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1254 #ifdef DITHER1XBPP |
2748 | 1255 "paddusb b5Dither, %%mm2 \n\t" |
1256 "paddusb g6Dither, %%mm4 \n\t" | |
1257 "paddusb r5Dither, %%mm5 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1258 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1259 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1260 WRITEBGR16 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1261 |
3209 | 1262 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1263 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1264 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1265 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1266 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1267 #else |
2575 | 1268 if(dstbpp==32) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1269 { |
2671 | 1270 int i; |
3209 | 1271 for(i=0; i<dstW-1; i+=2){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1272 // vertical linear interpolation && yuv2rgb in a single step: |
2575 | 1273 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
1274 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)]; | |
2585 | 1275 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1276 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2575 | 1277 |
1278 int Cb= yuvtab_40cf[U]; | |
1279 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1280 int Cr= yuvtab_3343[V]; | |
1281 | |
1282 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)]; | |
1283 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)]; | |
1284 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)]; | |
1285 | |
1286 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)]; | |
1287 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)]; | |
1288 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)]; | |
1289 } | |
1290 } | |
3344 | 1291 else if(dstbpp==24) |
2575 | 1292 { |
2671 | 1293 int i; |
3209 | 1294 for(i=0; i<dstW-1; i+=2){ |
2575 | 1295 // vertical linear interpolation && yuv2rgb in a single step: |
1296 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | |
1297 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)]; | |
2585 | 1298 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1299 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2575 | 1300 |
1301 int Cb= yuvtab_40cf[U]; | |
1302 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1303 int Cr= yuvtab_3343[V]; | |
1304 | |
1305 dest[0]=clip_table[((Y1 + Cb) >>13)]; | |
1306 dest[1]=clip_table[((Y1 + Cg) >>13)]; | |
1307 dest[2]=clip_table[((Y1 + Cr) >>13)]; | |
1308 | |
1309 dest[3]=clip_table[((Y2 + Cb) >>13)]; | |
1310 dest[4]=clip_table[((Y2 + Cg) >>13)]; | |
1311 dest[5]=clip_table[((Y2 + Cr) >>13)]; | |
1312 dest+=6; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1313 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1314 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1315 else if(dstbpp==16) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1316 { |
2671 | 1317 int i; |
3209 | 1318 for(i=0; i<dstW-1; i+=2){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1319 // vertical linear interpolation && yuv2rgb in a single step: |
2575 | 1320 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
1321 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)]; | |
2585 | 1322 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1323 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1324 |
2575 | 1325 int Cb= yuvtab_40cf[U]; |
1326 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1327 int Cr= yuvtab_3343[V]; | |
1328 | |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1329 ((uint16_t*)dest)[i] = |
2584 | 1330 clip_table16b[(Y1 + Cb) >>13] | |
1331 clip_table16g[(Y1 + Cg) >>13] | | |
1332 clip_table16r[(Y1 + Cr) >>13]; | |
2575 | 1333 |
1334 ((uint16_t*)dest)[i+1] = | |
2584 | 1335 clip_table16b[(Y2 + Cb) >>13] | |
1336 clip_table16g[(Y2 + Cg) >>13] | | |
1337 clip_table16r[(Y2 + Cr) >>13]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1338 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1339 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1340 else if(dstbpp==15) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1341 { |
2671 | 1342 int i; |
3209 | 1343 for(i=0; i<dstW-1; i+=2){ |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1344 // vertical linear interpolation && yuv2rgb in a single step: |
2575 | 1345 int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; |
1346 int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)]; | |
2585 | 1347 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1348 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1349 |
2575 | 1350 int Cb= yuvtab_40cf[U]; |
1351 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1352 int Cr= yuvtab_3343[V]; | |
1353 | |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1354 ((uint16_t*)dest)[i] = |
2584 | 1355 clip_table15b[(Y1 + Cb) >>13] | |
1356 clip_table15g[(Y1 + Cg) >>13] | | |
1357 clip_table15r[(Y1 + Cr) >>13]; | |
1358 | |
2575 | 1359 ((uint16_t*)dest)[i+1] = |
2584 | 1360 clip_table15b[(Y2 + Cb) >>13] | |
1361 clip_table15g[(Y2 + Cg) >>13] | | |
1362 clip_table15r[(Y2 + Cr) >>13]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1363 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1364 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1365 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1366 } //!FULL_UV_IPOL |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1367 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1368 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1369 /** |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1370 * YV12 to RGB without scaling or interpolating |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1371 */ |
3344 | 1372 static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, |
1373 uint8_t *dest, int dstW, int uvalpha, int dstbpp) | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1374 { |
2671 | 1375 int uvalpha1=uvalpha^4095; |
3344 | 1376 const int yalpha=0; |
1377 const int yalpha1=0; | |
2671 | 1378 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1379 if(fullUVIpol || allwaysIpol) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1380 { |
3344 | 1381 RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstbpp); |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1382 return; |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1383 } |
2576 | 1384 |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1385 #ifdef HAVE_MMX |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1386 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1387 { |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1388 if(dstbpp == 32) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1389 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1390 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1391 YSCALEYUV2RGB1 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1392 WRITEBGR32 |
3344 | 1393 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1394 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1395 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1396 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1397 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1398 else if(dstbpp==24) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1399 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1400 asm volatile( |
2728 | 1401 "movl %4, %%ebx \n\t" |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1402 YSCALEYUV2RGB1 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1403 WRITEBGR24 |
3344 | 1404 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1405 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1406 : "%eax", "%ebx" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1407 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1408 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1409 else if(dstbpp==15) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1410 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1411 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1412 YSCALEYUV2RGB1 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1413 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1414 #ifdef DITHER1XBPP |
2748 | 1415 "paddusb b5Dither, %%mm2 \n\t" |
1416 "paddusb g5Dither, %%mm4 \n\t" | |
1417 "paddusb r5Dither, %%mm5 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1418 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1419 WRITEBGR15 |
3344 | 1420 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1421 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1422 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1423 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1424 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1425 else if(dstbpp==16) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1426 { |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1427 asm volatile( |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1428 YSCALEYUV2RGB1 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1429 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1430 #ifdef DITHER1XBPP |
2748 | 1431 "paddusb b5Dither, %%mm2 \n\t" |
1432 "paddusb g6Dither, %%mm4 \n\t" | |
1433 "paddusb r5Dither, %%mm5 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1434 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1435 |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1436 WRITEBGR16 |
3344 | 1437 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1438 "m" (yalpha1), "m" (uvalpha1) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1439 : "%eax" |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1440 ); |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1441 } |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1442 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1443 else |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1444 { |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1445 if(dstbpp == 32) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1446 { |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1447 asm volatile( |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1448 YSCALEYUV2RGB1b |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1449 WRITEBGR32 |
3344 | 1450 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1451 "m" (yalpha1), "m" (uvalpha1) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1452 : "%eax" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1453 ); |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1454 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1455 else if(dstbpp==24) |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1456 { |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1457 asm volatile( |
2728 | 1458 "movl %4, %%ebx \n\t" |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1459 YSCALEYUV2RGB1b |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1460 WRITEBGR24 |
3344 | 1461 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1462 "m" (yalpha1), "m" (uvalpha1) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1463 : "%eax", "%ebx" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1464 ); |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1465 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1466 else if(dstbpp==15) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1467 { |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1468 asm volatile( |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1469 YSCALEYUV2RGB1b |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1470 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1471 #ifdef DITHER1XBPP |
2748 | 1472 "paddusb b5Dither, %%mm2 \n\t" |
1473 "paddusb g5Dither, %%mm4 \n\t" | |
1474 "paddusb r5Dither, %%mm5 \n\t" | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1475 #endif |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1476 WRITEBGR15 |
3344 | 1477 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1478 "m" (yalpha1), "m" (uvalpha1) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1479 : "%eax" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1480 ); |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1481 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1482 else if(dstbpp==16) |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1483 { |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1484 asm volatile( |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1485 YSCALEYUV2RGB1b |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1486 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1487 #ifdef DITHER1XBPP |
2748 | 1488 "paddusb b5Dither, %%mm2 \n\t" |
1489 "paddusb g6Dither, %%mm4 \n\t" | |
1490 "paddusb r5Dither, %%mm5 \n\t" | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1491 #endif |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1492 |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1493 WRITEBGR16 |
3344 | 1494 :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1495 "m" (yalpha1), "m" (uvalpha1) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1496 : "%eax" |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1497 ); |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1498 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1499 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1500 #else |
2576 | 1501 //FIXME write 2 versions (for even & odd lines) |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1502 |
2576 | 1503 if(dstbpp==32) |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1504 { |
2671 | 1505 int i; |
3209 | 1506 for(i=0; i<dstW-1; i+=2){ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1507 // vertical linear interpolation && yuv2rgb in a single step: |
2576 | 1508 int Y1=yuvtab_2568[buf0[i]>>7]; |
1509 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
2585 | 1510 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1511 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2576 | 1512 |
1513 int Cb= yuvtab_40cf[U]; | |
1514 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1515 int Cr= yuvtab_3343[V]; | |
1516 | |
1517 dest[4*i+0]=clip_table[((Y1 + Cb) >>13)]; | |
1518 dest[4*i+1]=clip_table[((Y1 + Cg) >>13)]; | |
1519 dest[4*i+2]=clip_table[((Y1 + Cr) >>13)]; | |
1520 | |
1521 dest[4*i+4]=clip_table[((Y2 + Cb) >>13)]; | |
1522 dest[4*i+5]=clip_table[((Y2 + Cg) >>13)]; | |
1523 dest[4*i+6]=clip_table[((Y2 + Cr) >>13)]; | |
1524 } | |
1525 } | |
3344 | 1526 else if(dstbpp==24) |
2576 | 1527 { |
2671 | 1528 int i; |
3209 | 1529 for(i=0; i<dstW-1; i+=2){ |
2576 | 1530 // vertical linear interpolation && yuv2rgb in a single step: |
1531 int Y1=yuvtab_2568[buf0[i]>>7]; | |
1532 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
2585 | 1533 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1534 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2576 | 1535 |
1536 int Cb= yuvtab_40cf[U]; | |
1537 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1538 int Cr= yuvtab_3343[V]; | |
1539 | |
1540 dest[0]=clip_table[((Y1 + Cb) >>13)]; | |
1541 dest[1]=clip_table[((Y1 + Cg) >>13)]; | |
1542 dest[2]=clip_table[((Y1 + Cr) >>13)]; | |
1543 | |
1544 dest[3]=clip_table[((Y2 + Cb) >>13)]; | |
1545 dest[4]=clip_table[((Y2 + Cg) >>13)]; | |
1546 dest[5]=clip_table[((Y2 + Cr) >>13)]; | |
1547 dest+=6; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1548 } |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1549 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1550 else if(dstbpp==16) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1551 { |
2671 | 1552 int i; |
3209 | 1553 for(i=0; i<dstW-1; i+=2){ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1554 // vertical linear interpolation && yuv2rgb in a single step: |
2576 | 1555 int Y1=yuvtab_2568[buf0[i]>>7]; |
1556 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
2585 | 1557 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1558 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1559 |
2576 | 1560 int Cb= yuvtab_40cf[U]; |
1561 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1562 int Cr= yuvtab_3343[V]; | |
1563 | |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1564 ((uint16_t*)dest)[i] = |
2584 | 1565 clip_table16b[(Y1 + Cb) >>13] | |
1566 clip_table16g[(Y1 + Cg) >>13] | | |
1567 clip_table16r[(Y1 + Cr) >>13]; | |
2576 | 1568 |
1569 ((uint16_t*)dest)[i+1] = | |
2584 | 1570 clip_table16b[(Y2 + Cb) >>13] | |
1571 clip_table16g[(Y2 + Cg) >>13] | | |
1572 clip_table16r[(Y2 + Cr) >>13]; | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1573 } |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1574 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1575 else if(dstbpp==15) |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1576 { |
2671 | 1577 int i; |
3209 | 1578 for(i=0; i<dstW-1; i+=2){ |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1579 // vertical linear interpolation && yuv2rgb in a single step: |
2576 | 1580 int Y1=yuvtab_2568[buf0[i]>>7]; |
1581 int Y2=yuvtab_2568[buf0[i+1]>>7]; | |
2585 | 1582 int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19); |
1583 int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19); | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1584 |
2576 | 1585 int Cb= yuvtab_40cf[U]; |
1586 int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U]; | |
1587 int Cr= yuvtab_3343[V]; | |
1588 | |
2572
f2353173d52c
c optimizations (array is faster than pointer) (16bpp variants tested and 2% faster)
michael
parents:
2569
diff
changeset
|
1589 ((uint16_t*)dest)[i] = |
2584 | 1590 clip_table15b[(Y1 + Cb) >>13] | |
1591 clip_table15g[(Y1 + Cg) >>13] | | |
1592 clip_table15r[(Y1 + Cr) >>13]; | |
1593 | |
2576 | 1594 ((uint16_t*)dest)[i+1] = |
2584 | 1595 clip_table15b[(Y2 + Cb) >>13] | |
1596 clip_table15g[(Y2 + Cg) >>13] | | |
1597 clip_table15r[(Y2 + Cr) >>13]; | |
2569
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1598 } |
30b736e7feef
interpolate chrominance for every second line in the 1:1 vertical scale function
michael
parents:
2566
diff
changeset
|
1599 } |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1600 #endif |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1601 } |
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1602 |
3272 | 1603 // Bilinear / Bicubic scaling |
1604 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, | |
1605 int16_t *filter, int16_t *filterPos, int filterSize) | |
1606 { | |
1607 #ifdef HAVE_MMX | |
1608 if(filterSize==4) // allways true for upscaling, sometimes for down too | |
1609 { | |
1610 int counter= -2*dstW; | |
1611 filter-= counter*2; | |
1612 filterPos-= counter/2; | |
1613 dst-= counter/2; | |
1614 asm volatile( | |
1615 "pxor %%mm7, %%mm7 \n\t" | |
1616 "movq w02, %%mm6 \n\t" | |
1617 "pushl %%ebp \n\t" // we use 7 regs here ... | |
1618 "movl %%eax, %%ebp \n\t" | |
1619 ".balign 16 \n\t" | |
1620 "1: \n\t" | |
1621 "movzwl (%2, %%ebp), %%eax \n\t" | |
1622 "movzwl 2(%2, %%ebp), %%ebx \n\t" | |
1623 "movq (%1, %%ebp, 4), %%mm1 \n\t" | |
1624 "movq 8(%1, %%ebp, 4), %%mm3 \n\t" | |
1625 "movd (%3, %%eax), %%mm0 \n\t" | |
1626 "movd (%3, %%ebx), %%mm2 \n\t" | |
1627 "punpcklbw %%mm7, %%mm0 \n\t" | |
1628 "punpcklbw %%mm7, %%mm2 \n\t" | |
1629 "pmaddwd %%mm1, %%mm0 \n\t" | |
1630 "pmaddwd %%mm2, %%mm3 \n\t" | |
1631 "psrad $8, %%mm0 \n\t" | |
1632 "psrad $8, %%mm3 \n\t" | |
1633 "packssdw %%mm3, %%mm0 \n\t" | |
1634 "pmaddwd %%mm6, %%mm0 \n\t" | |
1635 "packssdw %%mm0, %%mm0 \n\t" | |
1636 "movd %%mm0, (%4, %%ebp) \n\t" | |
1637 "addl $4, %%ebp \n\t" | |
1638 " jnc 1b \n\t" | |
1639 | |
1640 "popl %%ebp \n\t" | |
1641 : "+a" (counter) | |
1642 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
1643 : "%ebx" | |
1644 ); | |
1645 } | |
1646 else if(filterSize==8) | |
1647 { | |
1648 int counter= -2*dstW; | |
1649 filter-= counter*4; | |
1650 filterPos-= counter/2; | |
1651 dst-= counter/2; | |
1652 asm volatile( | |
1653 "pxor %%mm7, %%mm7 \n\t" | |
1654 "movq w02, %%mm6 \n\t" | |
1655 "pushl %%ebp \n\t" // we use 7 regs here ... | |
1656 "movl %%eax, %%ebp \n\t" | |
1657 ".balign 16 \n\t" | |
1658 "1: \n\t" | |
1659 "movzwl (%2, %%ebp), %%eax \n\t" | |
1660 "movzwl 2(%2, %%ebp), %%ebx \n\t" | |
1661 "movq (%1, %%ebp, 8), %%mm1 \n\t" | |
1662 "movq 16(%1, %%ebp, 8), %%mm3 \n\t" | |
1663 "movd (%3, %%eax), %%mm0 \n\t" | |
1664 "movd (%3, %%ebx), %%mm2 \n\t" | |
1665 "punpcklbw %%mm7, %%mm0 \n\t" | |
1666 "punpcklbw %%mm7, %%mm2 \n\t" | |
1667 "pmaddwd %%mm1, %%mm0 \n\t" | |
1668 "pmaddwd %%mm2, %%mm3 \n\t" | |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
1669 |
3272 | 1670 "movq 8(%1, %%ebp, 8), %%mm1 \n\t" |
1671 "movq 24(%1, %%ebp, 8), %%mm5 \n\t" | |
1672 "movd 4(%3, %%eax), %%mm4 \n\t" | |
1673 "movd 4(%3, %%ebx), %%mm2 \n\t" | |
1674 "punpcklbw %%mm7, %%mm4 \n\t" | |
1675 "punpcklbw %%mm7, %%mm2 \n\t" | |
1676 "pmaddwd %%mm1, %%mm4 \n\t" | |
1677 "pmaddwd %%mm2, %%mm5 \n\t" | |
1678 "paddd %%mm4, %%mm0 \n\t" | |
1679 "paddd %%mm5, %%mm3 \n\t" | |
1680 | |
1681 "psrad $8, %%mm0 \n\t" | |
1682 "psrad $8, %%mm3 \n\t" | |
1683 "packssdw %%mm3, %%mm0 \n\t" | |
1684 "pmaddwd %%mm6, %%mm0 \n\t" | |
1685 "packssdw %%mm0, %%mm0 \n\t" | |
1686 "movd %%mm0, (%4, %%ebp) \n\t" | |
1687 "addl $4, %%ebp \n\t" | |
1688 " jnc 1b \n\t" | |
3344 | 1689 |
3272 | 1690 "popl %%ebp \n\t" |
1691 : "+a" (counter) | |
1692 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |
1693 : "%ebx" | |
1694 ); | |
1695 } | |
1696 else | |
1697 { | |
1698 int counter= -2*dstW; | |
1699 // filter-= counter*filterSize/2; | |
1700 filterPos-= counter/2; | |
1701 dst-= counter/2; | |
1702 asm volatile( | |
1703 "pxor %%mm7, %%mm7 \n\t" | |
1704 "movq w02, %%mm6 \n\t" | |
1705 ".balign 16 \n\t" | |
1706 "1: \n\t" | |
1707 "movl %2, %%ecx \n\t" | |
1708 "movzwl (%%ecx, %0), %%eax \n\t" | |
1709 "movzwl 2(%%ecx, %0), %%ebx \n\t" | |
1710 "movl %5, %%ecx \n\t" | |
1711 "pxor %%mm4, %%mm4 \n\t" | |
1712 "pxor %%mm5, %%mm5 \n\t" | |
1713 "2: \n\t" | |
1714 "movq (%1), %%mm1 \n\t" | |
1715 "movq (%1, %6), %%mm3 \n\t" | |
1716 "movd (%%ecx, %%eax), %%mm0 \n\t" | |
1717 "movd (%%ecx, %%ebx), %%mm2 \n\t" | |
1718 "punpcklbw %%mm7, %%mm0 \n\t" | |
1719 "punpcklbw %%mm7, %%mm2 \n\t" | |
1720 "pmaddwd %%mm1, %%mm0 \n\t" | |
1721 "pmaddwd %%mm2, %%mm3 \n\t" | |
1722 "paddd %%mm3, %%mm5 \n\t" | |
1723 "paddd %%mm0, %%mm4 \n\t" | |
1724 "addl $8, %1 \n\t" | |
1725 "addl $4, %%ecx \n\t" | |
1726 "cmpl %4, %%ecx \n\t" | |
1727 " jb 2b \n\t" | |
1728 "addl %6, %1 \n\t" | |
1729 "psrad $8, %%mm4 \n\t" | |
1730 "psrad $8, %%mm5 \n\t" | |
1731 "packssdw %%mm5, %%mm4 \n\t" | |
1732 "pmaddwd %%mm6, %%mm4 \n\t" | |
1733 "packssdw %%mm4, %%mm4 \n\t" | |
1734 "movl %3, %%eax \n\t" | |
1735 "movd %%mm4, (%%eax, %0) \n\t" | |
1736 "addl $4, %0 \n\t" | |
1737 " jnc 1b \n\t" | |
3344 | 1738 |
3272 | 1739 : "+r" (counter) |
1740 : "r" (filter), "m" (filterPos), "m" (dst), "m"(src+filterSize), | |
1741 "m" (src), "r" (filterSize*2) | |
3299 | 1742 : "%ebx", "%eax", "%ecx" |
3272 | 1743 ); |
1744 } | |
1745 #else | |
1746 int i; | |
1747 for(i=0; i<dstW; i++) | |
1748 { | |
1749 int j; | |
1750 int srcPos= filterPos[i]; | |
1751 int val=0; | |
3344 | 1752 // printf("filterPos: %d\n", filterPos[i]); |
3272 | 1753 for(j=0; j<filterSize; j++) |
1754 { | |
1755 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); | |
1756 val += ((int)src[srcPos + j])*filter[filterSize*i + j]; | |
1757 } | |
1758 // filter += hFilterSize; | |
1759 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ... | |
1760 // dst[i] = val>>7; | |
1761 } | |
1762 #endif | |
1763 } | |
1764 // *** horizontal scale Y line to temp buffer | |
3215 | 1765 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc) |
2469 | 1766 { |
3272 | 1767 if(sws_flags != SWS_FAST_BILINEAR) |
1768 { | |
1769 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); | |
1770 } | |
1771 else // Fast Bilinear upscale / crap downscale | |
1772 { | |
2469 | 1773 #ifdef ARCH_X86 |
1774 #ifdef HAVE_MMX2 | |
2671 | 1775 int i; |
2469 | 1776 if(canMMX2BeUsed) |
1777 { | |
1778 asm volatile( | |
1779 "pxor %%mm7, %%mm7 \n\t" | |
1780 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | |
1781 "movd %5, %%mm6 \n\t" // xInc&0xFFFF | |
1782 "punpcklwd %%mm6, %%mm6 \n\t" | |
1783 "punpcklwd %%mm6, %%mm6 \n\t" | |
1784 "movq %%mm6, %%mm2 \n\t" | |
1785 "psllq $16, %%mm2 \n\t" | |
1786 "paddw %%mm6, %%mm2 \n\t" | |
1787 "psllq $16, %%mm2 \n\t" | |
1788 "paddw %%mm6, %%mm2 \n\t" | |
1789 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF | |
1790 "movq %%mm2, temp0 \n\t" | |
1791 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF | |
1792 "punpcklwd %%mm6, %%mm6 \n\t" | |
1793 "punpcklwd %%mm6, %%mm6 \n\t" | |
1794 "xorl %%eax, %%eax \n\t" // i | |
1795 "movl %0, %%esi \n\t" // src | |
1796 "movl %1, %%edi \n\t" // buf1 | |
1797 "movl %3, %%edx \n\t" // (xInc*4)>>16 | |
1798 "xorl %%ecx, %%ecx \n\t" | |
1799 "xorl %%ebx, %%ebx \n\t" | |
1800 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF | |
2520 | 1801 |
2469 | 1802 #define FUNNY_Y_CODE \ |
2520 | 1803 PREFETCH" 1024(%%esi) \n\t"\ |
1804 PREFETCH" 1056(%%esi) \n\t"\ | |
1805 PREFETCH" 1088(%%esi) \n\t"\ | |
2469 | 1806 "call funnyYCode \n\t"\ |
1807 "movq temp0, %%mm2 \n\t"\ | |
1808 "xorl %%ecx, %%ecx \n\t" | |
2520 | 1809 |
2469 | 1810 FUNNY_Y_CODE |
1811 FUNNY_Y_CODE | |
1812 FUNNY_Y_CODE | |
1813 FUNNY_Y_CODE | |
1814 FUNNY_Y_CODE | |
1815 FUNNY_Y_CODE | |
1816 FUNNY_Y_CODE | |
1817 FUNNY_Y_CODE | |
1818 | |
1819 :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16), | |
1820 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF) | |
1821 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | |
1822 ); | |
3215 | 1823 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; |
2469 | 1824 } |
1825 else | |
1826 { | |
1827 #endif | |
1828 //NO MMX just normal asm ... | |
1829 asm volatile( | |
1830 "xorl %%eax, %%eax \n\t" // i | |
1831 "xorl %%ebx, %%ebx \n\t" // xx | |
1832 "xorl %%ecx, %%ecx \n\t" // 2*xalpha | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
1833 ".balign 16 \n\t" |
2469 | 1834 "1: \n\t" |
1835 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | |
1836 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | |
1837 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
1838 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
1839 "shll $16, %%edi \n\t" | |
1840 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
1841 "movl %1, %%edi \n\t" | |
1842 "shrl $9, %%esi \n\t" | |
1843 "movw %%si, (%%edi, %%eax, 2) \n\t" | |
1844 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
1845 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry | |
1846 | |
1847 "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | |
1848 "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | |
1849 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
1850 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
1851 "shll $16, %%edi \n\t" | |
1852 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
1853 "movl %1, %%edi \n\t" | |
1854 "shrl $9, %%esi \n\t" | |
1855 "movw %%si, 2(%%edi, %%eax, 2) \n\t" | |
1856 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
1857 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry | |
1858 | |
1859 | |
1860 "addl $2, %%eax \n\t" | |
1861 "cmpl %2, %%eax \n\t" | |
1862 " jb 1b \n\t" | |
1863 | |
1864 | |
1865 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF) | |
1866 : "%eax", "%ebx", "%ecx", "%edi", "%esi" | |
1867 ); | |
1868 #ifdef HAVE_MMX2 | |
1869 } //if MMX2 cant be used | |
1870 #endif | |
1871 #else | |
2671 | 1872 int i; |
1873 unsigned int xpos=0; | |
1874 for(i=0;i<dstWidth;i++) | |
1875 { | |
1876 register unsigned int xx=xpos>>16; | |
1877 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
1878 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; | |
1879 xpos+=xInc; | |
1880 } | |
2469 | 1881 #endif |
3272 | 1882 } |
2469 | 1883 } |
1884 | |
3126 | 1885 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, |
3215 | 1886 uint8_t *src1, uint8_t *src2, int srcW, int xInc) |
2469 | 1887 { |
3272 | 1888 if(sws_flags != SWS_FAST_BILINEAR) |
1889 { | |
1890 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
1891 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | |
1892 } | |
1893 else // Fast Bilinear upscale / crap downscale | |
1894 { | |
2469 | 1895 #ifdef ARCH_X86 |
1896 #ifdef HAVE_MMX2 | |
2671 | 1897 int i; |
2469 | 1898 if(canMMX2BeUsed) |
1899 { | |
1900 asm volatile( | |
1901 "pxor %%mm7, %%mm7 \n\t" | |
1902 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | |
1903 "movd %5, %%mm6 \n\t" // xInc&0xFFFF | |
1904 "punpcklwd %%mm6, %%mm6 \n\t" | |
1905 "punpcklwd %%mm6, %%mm6 \n\t" | |
1906 "movq %%mm6, %%mm2 \n\t" | |
1907 "psllq $16, %%mm2 \n\t" | |
1908 "paddw %%mm6, %%mm2 \n\t" | |
1909 "psllq $16, %%mm2 \n\t" | |
1910 "paddw %%mm6, %%mm2 \n\t" | |
1911 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF | |
1912 "movq %%mm2, temp0 \n\t" | |
1913 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF | |
1914 "punpcklwd %%mm6, %%mm6 \n\t" | |
1915 "punpcklwd %%mm6, %%mm6 \n\t" | |
1916 "xorl %%eax, %%eax \n\t" // i | |
1917 "movl %0, %%esi \n\t" // src | |
1918 "movl %1, %%edi \n\t" // buf1 | |
1919 "movl %3, %%edx \n\t" // (xInc*4)>>16 | |
1920 "xorl %%ecx, %%ecx \n\t" | |
1921 "xorl %%ebx, %%ebx \n\t" | |
1922 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF | |
1923 | |
1924 #define FUNNYUVCODE \ | |
2520 | 1925 PREFETCH" 1024(%%esi) \n\t"\ |
1926 PREFETCH" 1056(%%esi) \n\t"\ | |
1927 PREFETCH" 1088(%%esi) \n\t"\ | |
2469 | 1928 "call funnyUVCode \n\t"\ |
1929 "movq temp0, %%mm2 \n\t"\ | |
1930 "xorl %%ecx, %%ecx \n\t" | |
1931 | |
1932 FUNNYUVCODE | |
1933 FUNNYUVCODE | |
1934 FUNNYUVCODE | |
1935 FUNNYUVCODE | |
1936 | |
1937 FUNNYUVCODE | |
1938 FUNNYUVCODE | |
1939 FUNNYUVCODE | |
1940 FUNNYUVCODE | |
1941 "xorl %%eax, %%eax \n\t" // i | |
1942 "movl %6, %%esi \n\t" // src | |
1943 "movl %1, %%edi \n\t" // buf1 | |
1944 "addl $4096, %%edi \n\t" | |
1945 | |
1946 FUNNYUVCODE | |
1947 FUNNYUVCODE | |
1948 FUNNYUVCODE | |
1949 FUNNYUVCODE | |
1950 | |
1951 FUNNYUVCODE | |
1952 FUNNYUVCODE | |
1953 FUNNYUVCODE | |
1954 FUNNYUVCODE | |
1955 | |
1956 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16), | |
1957 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2) | |
1958 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | |
1959 ); | |
3344 | 1960 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) |
2469 | 1961 { |
3344 | 1962 // printf("%d %d %d\n", dstWidth, i, srcW); |
1963 dst[i] = src1[srcW-1]*128; | |
1964 dst[i+2048] = src2[srcW-1]*128; | |
2469 | 1965 } |
1966 } | |
1967 else | |
1968 { | |
1969 #endif | |
1970 asm volatile( | |
1971 "xorl %%eax, %%eax \n\t" // i | |
1972 "xorl %%ebx, %%ebx \n\t" // xx | |
1973 "xorl %%ecx, %%ecx \n\t" // 2*xalpha | |
2800
7847d6b7ad3d
.balign or we¡ll align by 64kb on some architectures
michael
parents:
2799
diff
changeset
|
1974 ".balign 16 \n\t" |
2469 | 1975 "1: \n\t" |
1976 "movl %0, %%esi \n\t" | |
1977 "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx] | |
1978 "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1] | |
1979 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
1980 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
1981 "shll $16, %%edi \n\t" | |
1982 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
1983 "movl %1, %%edi \n\t" | |
1984 "shrl $9, %%esi \n\t" | |
1985 "movw %%si, (%%edi, %%eax, 2) \n\t" | |
1986 | |
1987 "movzbl (%5, %%ebx), %%edi \n\t" //src[xx] | |
1988 "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1] | |
1989 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |
1990 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |
1991 "shll $16, %%edi \n\t" | |
1992 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |
1993 "movl %1, %%edi \n\t" | |
1994 "shrl $9, %%esi \n\t" | |
1995 "movw %%si, 4096(%%edi, %%eax, 2)\n\t" | |
1996 | |
1997 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | |
1998 "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry | |
1999 "addl $1, %%eax \n\t" | |
2000 "cmpl %2, %%eax \n\t" | |
2001 " jb 1b \n\t" | |
2002 | |
2003 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF), | |
2004 "r" (src2) | |
2005 : "%eax", "%ebx", "%ecx", "%edi", "%esi" | |
2006 ); | |
2007 #ifdef HAVE_MMX2 | |
2008 } //if MMX2 cant be used | |
2009 #endif | |
2010 #else | |
2671 | 2011 int i; |
2012 unsigned int xpos=0; | |
2013 for(i=0;i<dstWidth;i++) | |
2014 { | |
2015 register unsigned int xx=xpos>>16; | |
2016 register unsigned int xalpha=(xpos&0xFFFF)>>9; | |
2017 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | |
2018 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | |
2566 | 2019 /* slower |
2020 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; | |
2021 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; | |
2022 */ | |
2671 | 2023 xpos+=xInc; |
2024 } | |
2469 | 2025 #endif |
3272 | 2026 } |
2027 } | |
2028 | |
3344 | 2029 static inline void RENAME(initFilter)(int16_t *filter, int16_t *filterPos, int *filterSize, int xInc, |
2030 int srcW, int dstW, int filterAlign, int one) | |
3272 | 2031 { |
2032 int i; | |
2033 #ifdef HAVE_MMX | |
3344 | 2034 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions) |
3272 | 2035 #endif |
3344 | 2036 |
2037 if(ABS(xInc - 0x10000) <10) // unscaled | |
2038 { | |
2039 int i; | |
2040 *filterSize= (1 +(filterAlign-1)) & (~(filterAlign-1)); // 1 or 4 normaly | |
2041 for(i=0; i<dstW*(*filterSize); i++) filter[i]=0; | |
2042 | |
2043 for(i=0; i<dstW; i++) | |
2044 { | |
2045 filter[i*(*filterSize)]=1; | |
2046 filterPos[i]=i; | |
2047 } | |
2048 | |
2049 } | |
2050 else if(xInc <= (1<<16) || sws_flags==SWS_FAST_BILINEAR) // upscale | |
3272 | 2051 { |
2052 int i; | |
2053 int xDstInSrc; | |
2054 if(sws_flags==SWS_BICUBIC) *filterSize= 4; | |
2055 else *filterSize= 2; | |
2056 // printf("%d %d %d\n", filterSize, srcW, dstW); | |
3344 | 2057 *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1)); |
2058 | |
3272 | 2059 xDstInSrc= xInc - 0x8000; |
2060 for(i=0; i<dstW; i++) | |
2061 { | |
2062 int xx= (xDstInSrc>>16) - (*filterSize>>1) + 1; | |
2063 int j; | |
3344 | 2064 |
3272 | 2065 filterPos[i]= xx; |
2066 if(sws_flags == SWS_BICUBIC) | |
3344 | 2067 { |
3272 | 2068 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16); |
2069 // int coeff; | |
2070 int y1,y2,y3,y4; | |
2071 double A= -0.75; | |
2072 // Equation is from VirtualDub | |
2073 y1 = (int)floor(0.5 + ( + A*d - 2.0*A*d*d + A*d*d*d) * 16384.0); | |
2074 y2 = (int)floor(0.5 + (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d) * 16384.0); | |
2075 y3 = (int)floor(0.5 + ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d) * 16384.0); | |
2076 y4 = (int)floor(0.5 + ( + A*d*d - A*d*d*d) * 16384.0); | |
3344 | 2077 |
3272 | 2078 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc); |
2079 filter[i*(*filterSize) + 0]= y1; | |
2080 filter[i*(*filterSize) + 1]= y2; | |
2081 filter[i*(*filterSize) + 2]= y3; | |
2082 filter[i*(*filterSize) + 3]= y4; | |
2083 // printf("%1.3f %d, %d, %d, %d\n",d , y1, y2, y3, y4); | |
2084 } | |
3344 | 2085 else |
3272 | 2086 { |
2087 for(j=0; j<*filterSize; j++) | |
2088 { | |
2089 double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16); | |
2090 int coeff; | |
2091 coeff= (int)(0.5 + (1.0 - d)*(1<<14)); | |
3344 | 2092 if(coeff<0) coeff=0; |
3272 | 2093 // printf("%d %d %d \n", coeff, (int)d, xDstInSrc); |
2094 filter[i*(*filterSize) + j]= coeff; | |
2095 xx++; | |
2096 } | |
3344 | 2097 } |
2098 xDstInSrc+= xInc; | |
3272 | 2099 } |
2100 } | |
2101 else // downscale | |
2102 { | |
2103 int xDstInSrc; | |
2104 if(sws_flags==SWS_BICUBIC) *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW); | |
2105 else *filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW); | |
2106 // printf("%d %d %d\n", *filterSize, srcW, dstW); | |
3344 | 2107 *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1)); |
2108 | |
3272 | 2109 xDstInSrc= xInc - 0x8000; |
2110 for(i=0; i<dstW; i++) | |
2111 { | |
2112 int xx= (int)((double)xDstInSrc/(double)(1<<16) - *filterSize*0.5 + 0.5); | |
2113 int j; | |
3344 | 2114 |
3272 | 2115 filterPos[i]= xx; |
2116 for(j=0; j<*filterSize; j++) | |
2117 { | |
2118 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc; | |
2119 int coeff; | |
2120 if(sws_flags == SWS_BICUBIC) | |
3344 | 2121 { |
3272 | 2122 double A= -0.75; |
2123 // d*=2; | |
2124 // Equation is from VirtualDub | |
2125 if(d<1.0) | |
3344 | 2126 coeff = (int)floor(0.5 + (1.0 - (A+3.0)*d*d |
3272 | 2127 + (A+2.0)*d*d*d) * (1<<14)); |
2128 else if(d<2.0) | |
3344 | 2129 coeff = (int)floor(0.5 + (-4.0*A + 8.0*A*d |
3272 | 2130 - 5.0*A*d*d + A*d*d*d) * (1<<14)); |
2131 else | |
2132 coeff=0; | |
2133 } | |
3344 | 2134 else |
3272 | 2135 { |
2136 coeff= (int)(0.5 + (1.0 - d)*(1<<14)); | |
3344 | 2137 if(coeff<0) coeff=0; |
3272 | 2138 } |
3344 | 2139 // if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc); |
3272 | 2140 filter[i*(*filterSize) + j]= coeff; |
2141 xx++; | |
2142 } | |
2143 xDstInSrc+= xInc; | |
2144 } | |
2145 } | |
3344 | 2146 |
3272 | 2147 //fix borders |
2148 for(i=0; i<dstW; i++) | |
2149 { | |
2150 int j; | |
2151 if(filterPos[i] < 0) | |
2152 { | |
2153 // Move filter coeffs left to compensate for filterPos | |
2154 for(j=1; j<*filterSize; j++) | |
2155 { | |
2156 int left= MAX(j + filterPos[i], 0); | |
2157 filter[i*(*filterSize) + left] += filter[i*(*filterSize) + j]; | |
2158 filter[i*(*filterSize) + j]=0; | |
2159 } | |
2160 filterPos[i]= 0; | |
2161 } | |
2162 | |
2163 if(filterPos[i] + *filterSize > srcW) | |
2164 { | |
2165 int shift= filterPos[i] + *filterSize - srcW; | |
2166 // Move filter coeffs right to compensate for filterPos | |
2167 for(j=*filterSize-2; j>=0; j--) | |
2168 { | |
2169 int right= MIN(j + shift, *filterSize-1); | |
2170 filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j]; | |
2171 filter[i*(*filterSize) +j]=0; | |
2172 } | |
3344 | 2173 filterPos[i]= srcW - *filterSize; |
3272 | 2174 } |
2175 } | |
3344 | 2176 |
2177 //FIXME try to align filterpos if possible / try to shift filterpos to put zeros at the end | |
2178 // and skip these than later | |
2179 | |
3272 | 2180 //Normalize |
2181 for(i=0; i<dstW; i++) | |
2182 { | |
2183 int j; | |
2184 double sum=0; | |
3344 | 2185 double scale= one; |
3272 | 2186 for(j=0; j<*filterSize; j++) |
2187 { | |
2188 sum+= filter[i*(*filterSize) + j]; | |
2189 } | |
2190 scale/= sum; | |
2191 for(j=0; j<*filterSize; j++) | |
2192 { | |
2193 filter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale); | |
2194 } | |
2195 } | |
2469 | 2196 } |
2316
bcb229557e9b
fixed alignment (static variables where sometimes not 8-byte aligned)
michael
parents:
2297
diff
changeset
|
2197 |
#ifdef HAVE_MMX2
/*
 * Generate a specialized horizontal scaler at run time (MMX2 only).
 *
 * A template code fragment is embedded in the inline asm below; its
 * address, length and the offsets of the two pshufw immediates are
 * extracted via the local labels. The fragment is then stamped out
 * dstW/8 times into funnyCode, patching each copy's shuffle masks with
 * the per-pixel source offsets derived from xInc, and terminated with
 * a RET opcode. Upscaling only — the fragment cannot skip input.
 */
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
{
	uint8_t *fragment;      /* start of the template fragment */
	int imm8OfPShufW1;      /* byte offset of 1st pshufw immediate */
	int imm8OfPShufW2;      /* byte offset of 2nd pshufw immediate */
	int fragmentLength;     /* template length in bytes */

	int xpos, i;

	// create an optimized horizontal scaling routine

	//code fragment

	asm volatile(
		"jmp 9f \n\t"
		// Begin
		"0: \n\t"
		"movq (%%esi), %%mm0 \n\t" //FIXME Alignment
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $8, %%mm0 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t"
		"addw %%bx, %%cx \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
		"pshufw $0xFF, %%mm1, %%mm1 \n\t"
		"1: \n\t"
		"adcl %%edx, %%esi \n\t" //xx+= (4*lumXInc)>>16 + carry
		"pshufw $0xFF, %%mm0, %%mm0 \n\t"
		"2: \n\t"
		"psrlw $9, %%mm3 \n\t"
		"psubw %%mm1, %%mm0 \n\t"
		"pmullw %%mm3, %%mm0 \n\t"
		"paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
		"psllw $7, %%mm1 \n\t"
		"paddw %%mm1, %%mm0 \n\t"

		"movq %%mm0, (%%edi, %%eax) \n\t"

		"addl $8, %%eax \n\t"
		// End
		"9: \n\t"
		/* recover fragment address, patch offsets and length */
		"leal 0b, %0 \n\t"
		"leal 1b, %1 \n\t"
		"leal 2b, %2 \n\t"
		"decl %1 \n\t"
		"decl %2 \n\t"
		"subl %0, %1 \n\t"
		"subl %0, %2 \n\t"
		"leal 9b, %3 \n\t"
		"subl %0, %3 \n\t"
		:"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
		"=r" (fragmentLength)
	);

	xpos = 0; //lumXInc/2 - 0x8000; // difference between pixel centers

	for (i = 0; i < dstW/8; i++) {
		int xx = xpos>>16;

		/* one stamped fragment covers 4 output pixels */
		if ((i&3) == 0) {
			int a = 0;
			int b = ((xpos+xInc  )>>16) - xx;
			int c = ((xpos+xInc*2)>>16) - xx;
			int d = ((xpos+xInc*3)>>16) - xx;

			memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);

			/* pack the 4 source offsets into both pshufw immediates */
			funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
			funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
				a | (b<<2) | (c<<4) | (d<<6);

			// if we dont need to read 8 bytes than dont :), reduces the chance of
			// crossing a cache line
			if (d < 3) funnyCode[fragmentLength*i/4 + 1] = 0x6E;

			/* terminate after the last stamped copy */
			funnyCode[fragmentLength*(i+4)/4] = RET;
		}
		xpos += xInc;
	}
}
#endif // HAVE_MMX2
2312 | |
3209 | 2313 static void RENAME(SwScale_YV12slice)(unsigned char* srcptr[],int stride[], int srcSliceY , |
2314 int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp, | |
2315 int srcW, int srcH, int dstW, int dstH){ | |
2216 | 2316 |
2317 | |
3344 | 2318 unsigned int lumXInc= (srcW << 16) / dstW; |
2319 unsigned int lumYInc= (srcH << 16) / dstH; | |
2320 unsigned int chrXInc; | |
2321 unsigned int chrYInc; | |
2322 | |
3209 | 2323 static int dstY; |
2216 | 2324 |
3344 | 2325 // used to detect a size change |
3272 | 2326 static int oldDstW= -1; |
2327 static int oldSrcW= -1; | |
3344 | 2328 static int oldDstH= -1; |
2329 static int oldSrcH= -1; | |
2330 static int oldFlags=-1; | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2331 |
3344 | 2332 static int lastInLumBuf; |
2333 static int lastInChrBuf; | |
2334 | |
2335 int chrDstW, chrDstH; | |
2336 | |
2337 static int lumBufIndex=0; | |
2338 static int chrBufIndex=0; | |
2216 | 2339 |
3344 | 2340 static int firstTime=1; |
2680 | 2341 |
3344 | 2342 int widthAlign= dstbpp==12 ? 16 : 8; |
2343 if(((dstW + widthAlign-1)&(~(widthAlign-1))) > dststride) | |
2344 { | |
2345 dstW&= ~(widthAlign-1); | |
2346 if(firstTime) | |
2347 fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n" | |
2348 "SwScaler: ->lowering width to compensate, new width=%d\n" | |
2349 "SwScaler: ->cannot do aligned memory acesses anymore\n", | |
2350 widthAlign, dstW); | |
2351 } | |
2680 | 2352 |
3215 | 2353 //printf("%d %d %d %d\n", srcW, srcH, dstW, dstH); |
3344 | 2354 //printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH); |
3215 | 2355 |
2270 | 2356 #ifdef HAVE_MMX2 |
3344 | 2357 canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0; |
2358 if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR) | |
2359 { | |
2360 if(firstTime) //FIXME only if verbose ? | |
2361 fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n"); | |
2362 } | |
2363 #endif | |
2364 | |
2365 if(firstTime) | |
2366 { | |
3345 | 2367 #if defined (DITHER1XBPP) && defined (HAVE_MMX) |
2368 char *dither= " dithered"; | |
3344 | 2369 #else |
2370 char *dither= ""; | |
2270 | 2371 #endif |
3344 | 2372 if(sws_flags==SWS_FAST_BILINEAR) |
2373 fprintf(stderr, "SwScaler: FAST_BILINEAR scaler "); | |
2374 else if(sws_flags==SWS_BILINEAR) | |
2375 fprintf(stderr, "SwScaler: BILINEAR scaler "); | |
2376 else if(sws_flags==SWS_BICUBIC) | |
2377 fprintf(stderr, "SwScaler: BICUBIC scaler "); | |
2378 else | |
2379 fprintf(stderr, "SwScaler: ehh flags invalid?! "); | |
2380 | |
2381 if(dstbpp==15) | |
3345 | 2382 fprintf(stderr, "with%s BGR15 output ", dither); |
3344 | 2383 else if(dstbpp==16) |
3345 | 2384 fprintf(stderr, "with%s BGR16 output ", dither); |
3344 | 2385 else if(dstbpp==24) |
2386 fprintf(stderr, "with BGR24 output "); | |
2387 else if(dstbpp==32) | |
2388 fprintf(stderr, "with BGR32 output "); | |
2389 else if(dstbpp==12) | |
2390 fprintf(stderr, "with YV12 output "); | |
2391 else | |
2392 fprintf(stderr, "without output "); | |
2393 | |
2394 #ifdef HAVE_MMX2 | |
2395 fprintf(stderr, "using MMX2\n"); | |
2396 #elif defined (HAVE_3DNOW) | |
2397 fprintf(stderr, "using 3DNOW\n"); | |
2398 #elif defined (HAVE_MMX) | |
2399 fprintf(stderr, "using MMX\n"); | |
2400 #elif defined (ARCH_X86) | |
2401 fprintf(stderr, "using X86 ASM2\n"); | |
2402 #else | |
2403 fprintf(stderr, "using C\n"); | |
2404 #endif | |
2405 } | |
2406 | |
2270 | 2407 |
2279
9b9c3363abbe
horizontal scaling bugs fixed, should be mostly bugfree now
michael
parents:
2274
diff
changeset
|
2408 // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst |
9b9c3363abbe
horizontal scaling bugs fixed, should be mostly bugfree now
michael
parents:
2274
diff
changeset
|
2409 // n-2 is the last chrominance sample available |
3344 | 2410 // this is not perfect, but noone shuld notice the difference, the more correct variant |
2279
9b9c3363abbe
horizontal scaling bugs fixed, should be mostly bugfree now
michael
parents:
2274
diff
changeset
|
2411 // would be like the vertical one, but that would require some special code for the |
9b9c3363abbe
horizontal scaling bugs fixed, should be mostly bugfree now
michael
parents:
2274
diff
changeset
|
2412 // first and last pixel |
3344 | 2413 if(sws_flags==SWS_FAST_BILINEAR) |
2414 { | |
2415 if(canMMX2BeUsed) lumXInc+= 20; | |
2416 else lumXInc = ((srcW-2)<<16)/(dstW-2) - 20; | |
2417 } | |
2279
9b9c3363abbe
horizontal scaling bugs fixed, should be mostly bugfree now
michael
parents:
2274
diff
changeset
|
2418 |
3344 | 2419 if(fullUVIpol && !(dstbpp==12)) chrXInc= lumXInc>>1, chrDstW= dstW; |
2420 else chrXInc= lumXInc, chrDstW= dstW>>1; | |
2421 | |
2422 if(dstbpp==12) chrYInc= lumYInc, chrDstH= dstH>>1; | |
2423 else chrYInc= lumYInc>>1, chrDstH= dstH; | |
2424 | |
2271 | 2425 // force calculation of the horizontal interpolation of the first line |
2426 | |
3209 | 2427 if(srcSliceY ==0){ |
3215 | 2428 // printf("dstW %d, srcw %d, mmx2 %d\n", dstW, srcW, canMMX2BeUsed); |
3344 | 2429 lumBufIndex=0; |
2430 chrBufIndex=0; | |
3209 | 2431 dstY=0; |
2638
d4211422b6cc
right green line bugfix for width not %8==0 (untested -vo vesa doesnt work)
michael
parents:
2585
diff
changeset
|
2432 |
3344 | 2433 //precalculate horizontal scaler filter coefficients |
2434 if(oldDstW!=dstW || oldSrcW!=srcW || oldFlags!=sws_flags) | |
2638
d4211422b6cc
right green line bugfix for width not %8==0 (untested -vo vesa doesnt work)
michael
parents:
2585
diff
changeset
|
2435 { |
3344 | 2436 #ifdef HAVE_MMX |
2437 const int filterAlign=4; | |
2438 #else | |
2439 const int filterAlign=1; | |
2440 #endif | |
2441 oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags; | |
2638
d4211422b6cc
right green line bugfix for width not %8==0 (untested -vo vesa doesnt work)
michael
parents:
2585
diff
changeset
|
2442 |
3344 | 2443 if(sws_flags != SWS_FAST_BILINEAR) |
2444 { | |
2445 RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc, | |
2446 srcW , dstW , filterAlign, 1<<14); | |
2447 RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc, | |
2448 srcW>>1, chrDstW, filterAlign, 1<<14); | |
2449 } | |
2450 | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2451 #ifdef HAVE_MMX2 |
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2452 // cant downscale !!! |
3344 | 2453 if(canMMX2BeUsed && sws_flags == SWS_FAST_BILINEAR) |
2454 { | |
2455 initMMX2HScaler(dstW , lumXInc, funnyYCode); | |
2456 initMMX2HScaler(chrDstW, chrXInc, funnyUVCode); | |
2457 } | |
2458 #endif | |
2459 } // Init Horizontal stuff | |
2460 | |
2461 if(oldDstH!=dstH || oldSrcH!=srcH || oldFlags!=sws_flags) | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2462 { |
3344 | 2463 int i; |
2464 oldDstH= dstH; oldSrcH= srcH; oldFlags= sws_flags; //FIXME swsflags conflict with x check | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2465 |
3344 | 2466 // deallocate pixbufs |
2467 for(i=0; i<vLumBufSize; i++) free(lumPixBuf[i]); | |
2468 for(i=0; i<vChrBufSize; i++) free(chrPixBuf[i]); | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2469 |
3344 | 2470 RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc, |
2471 srcH , dstH, 1, (1<<12)-4); | |
2472 RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc, | |
2473 srcH>>1, chrDstH, 1, (1<<12)-4); | |
2264
7851375ea156
increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents:
2237
diff
changeset
|
2474 |
3344 | 2475 // Calculate Buffer Sizes so that they wont run out while handling these damn slices |
2476 vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize; | |
2477 for(i=0; i<dstH; i++) | |
2478 { | |
2479 int chrI= i*chrDstH / dstH; | |
2480 int nextSlice= MAX(vLumFilterPos[i ] + vLumFilterSize - 1, | |
2481 ((vChrFilterPos[chrI] + vChrFilterSize - 1)<<1)); | |
2482 nextSlice&= ~1; // Slices start at even boundaries | |
2483 if(vLumFilterPos[i ] + vLumBufSize < nextSlice) | |
2484 vLumBufSize= nextSlice - vLumFilterPos[i ]; | |
2485 if(vChrFilterPos[chrI] + vChrBufSize < (nextSlice>>1)) | |
2486 vChrBufSize= (nextSlice>>1) - vChrFilterPos[chrI]; | |
2487 } | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2488 |
3344 | 2489 // allocate pixbufs (we use dynamic allocation because otherwise we would need to |
2490 // allocate several megabytes to handle all possible cases) | |
2491 for(i=0; i<vLumBufSize; i++) | |
2492 lumPixBuf[i]= lumPixBuf[i+vLumBufSize]= (uint16_t*)memalign(8, 4000); | |
2493 for(i=0; i<vChrBufSize; i++) | |
2494 chrPixBuf[i]= chrPixBuf[i+vChrBufSize]= (uint16_t*)memalign(8, 8000); | |
2495 | |
2496 //try to avoid drawing green stuff between the right end and the stride end | |
2497 for(i=0; i<vLumBufSize; i++) memset(lumPixBuf[i], 0, 4000); | |
2498 for(i=0; i<vChrBufSize; i++) memset(chrPixBuf[i], 64, 8000); | |
2499 | |
2500 #ifdef HAVE_MMX | |
2501 // pack filter data for mmx code | |
2502 for(i=0; i<vLumFilterSize*dstH; i++) | |
2503 lumMmxFilter[4*i]=lumMmxFilter[4*i+1]=lumMmxFilter[4*i+2]=lumMmxFilter[4*i+3]= | |
2504 vLumFilter[i]; | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2505 |
3344 | 2506 for(i=0; i<vChrFilterSize*chrDstH; i++) |
2507 chrMmxFilter[4*i]=chrMmxFilter[4*i+1]=chrMmxFilter[4*i+2]=chrMmxFilter[4*i+3]= | |
2508 vChrFilter[i]; | |
2509 #endif | |
2510 } | |
2511 | |
2512 lastInLumBuf= -1; | |
2513 lastInChrBuf= -1; | |
2514 } // if(firstLine) | |
2264
7851375ea156
increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents:
2237
diff
changeset
|
2515 |
3344 | 2516 for(;dstY < dstH; dstY++){ |
2517 unsigned char *dest =dstptr[0]+dststride*dstY; | |
2518 unsigned char *uDest=dstptr[1]+(dststride>>1)*(dstY>>1); | |
2519 unsigned char *vDest=dstptr[2]+(dststride>>1)*(dstY>>1); | |
2520 const int chrDstY= dstbpp==12 ? (dstY>>1) : dstY; | |
2521 | |
2522 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input | |
2523 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input | |
2524 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input | |
2525 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input | |
2526 | |
2527 if(sws_flags == SWS_FAST_BILINEAR) | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2528 { |
3344 | 2529 //handle holes |
2530 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; | |
2531 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; | |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2532 } |
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2533 |
3344 | 2534 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1) |
2535 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1) | |
2216 | 2536 |
3344 | 2537 // Do we have enough lines in this slice to output the dstY line |
2538 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH)>>1)) | |
2469 | 2539 { |
3344 | 2540 //Do horizontal scaling |
2541 while(lastInLumBuf < lastLumSrcY) | |
2542 { | |
2543 uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0]; | |
2544 lumBufIndex++; | |
2545 ASSERT(lumBufIndex < 2*vLumBufSize) | |
2546 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
2547 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
2548 // printf("%d %d\n", lumBufIndex, vLumBufSize); | |
2549 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc); | |
2550 lastInLumBuf++; | |
2551 } | |
2552 while(lastInChrBuf < lastChrSrcY) | |
2553 { | |
2554 uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1]; | |
2555 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2]; | |
2556 chrBufIndex++; | |
2557 ASSERT(chrBufIndex < 2*vChrBufSize) | |
2558 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1)) | |
2559 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) | |
2560 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, srcW>>1, chrXInc); | |
2561 lastInChrBuf++; | |
2562 } | |
2563 //wrap buf index around to stay inside the ring buffer | |
2564 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
2565 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
2469 | 2566 } |
3344 | 2567 else // not enough lines left in this slice -> load the rest in the buffer |
2469 | 2568 { |
3344 | 2569 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n", |
2570 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY, | |
2571 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize, | |
2572 vChrBufSize, vLumBufSize); | |
2573 */ | |
2574 //Do horizontal scaling | |
2575 while(lastInLumBuf+1 < srcSliceY + srcSliceH) | |
2469 | 2576 { |
3344 | 2577 uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0]; |
2578 lumBufIndex++; | |
2579 ASSERT(lumBufIndex < 2*vLumBufSize) | |
2580 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | |
2581 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | |
2582 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc); | |
2583 lastInLumBuf++; | |
2469 | 2584 } |
3344 | 2585 while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1)) |
2586 { | |
2587 uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1]; | |
2588 uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2]; | |
2589 chrBufIndex++; | |
2590 ASSERT(chrBufIndex < 2*vChrBufSize) | |
2591 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1)) | |
2592 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) | |
2593 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, srcW>>1, chrXInc); | |
2594 lastInChrBuf++; | |
2595 } | |
2596 //wrap buf index around to stay inside the ring buffer | |
2597 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | |
2598 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | |
2599 break; //we cant output a dstY line so lets try with the next slice | |
2469 | 2600 } |
2264
7851375ea156
increased precission of s_xinc s_xinc2 (needed for the mmx2 bugfix)
michael
parents:
2237
diff
changeset
|
2601 |
2748 | 2602 #ifdef HAVE_MMX |
3344 | 2603 b5Dither= dither8[dstY&1]; |
2604 g6Dither= dither4[dstY&1]; | |
2605 g5Dither= dither8[dstY&1]; | |
2606 r5Dither= dither8[(dstY+1)&1]; | |
2748 | 2607 #endif |
2232
65996b3467d7
MMX & MMX2 optimizations (MMX2 is buggy and commented out)
michael
parents:
2230
diff
changeset
|
2608 |
3344 | 2609 if(dstbpp==12) //YV12 |
2610 { | |
2611 if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | |
2612 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12 | |
2613 { | |
2614 int16_t *lumBuf = lumPixBuf[0]; | |
2615 int16_t *chrBuf= chrPixBuf[0]; | |
2616 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW); | |
2617 } | |
2618 else //General YV12 | |
2619 { | |
2620 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
2621 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
2622 RENAME(yuv2yuvX)( | |
2623 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | |
2624 vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2625 dest, uDest, vDest, dstW, | |
2626 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4); | |
2627 } | |
2628 } | |
2629 else | |
2630 { | |
2631 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | |
2632 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | |
2633 | |
2634 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |
2635 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | |
2636 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB | |
2637 { | |
2638 int chrAlpha= vChrFilter[2*dstY+1]; | |
2639 | |
2640 RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), | |
2641 dest, dstW, chrAlpha, dstbpp); | |
2642 } | |
2643 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB | |
2644 { | |
2645 int lumAlpha= vLumFilter[2*dstY+1]; | |
2646 int chrAlpha= vChrFilter[2*dstY+1]; | |
2647 | |
2648 RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), | |
2649 dest, dstW, lumAlpha, chrAlpha, dstbpp); | |
2650 } | |
2651 else //General RGB | |
2652 { | |
2653 RENAME(yuv2rgbX)( | |
2654 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | |
2655 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | |
2656 dest, dstW, dstbpp, | |
2657 lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4); | |
2658 } | |
2659 } | |
2660 } | |
2534
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2661 |
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2662 #ifdef HAVE_MMX |
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2663 __asm __volatile(SFENCE:::"memory"); |
2566 | 2664 __asm __volatile(EMMS:::"memory"); |
2534
cc9d3fd626f0
patch from Martin Decky <deckm1am@ss1000.ms.mff.cuni.cz> applied and unnecassery "memory" removed
michael
parents:
2521
diff
changeset
|
2665 #endif |
3344 | 2666 firstTime=0; |
2667 } |