Mercurial > mplayer.hg
annotate libmpcodecs/vf_ass.c @ 35579:6b169870ae30
Reduce register usage to fix the compilation in x86.
author | upsuper |
---|---|
date | Thu, 13 Dec 2012 02:17:27 +0000 |
parents | 2508973357da |
children | f47dbb805ecb |
rev | line source |
---|---|
20008
fa122b7c71c6
Add copyright notice and vim/emacs comments to libass and vf_ass.c.
eugeni
parents:
19563
diff
changeset
|
1 /* |
26727 | 2 * Copyright (C) 2006 Evgeniy Stepanov <eugeni.stepanov@gmail.com> |
35244 | 3 * Copyright (C) 2012 Xidorn Quan <quanxunzhen@gmail.com> |
26727 | 4 * |
5 * This file is part of MPlayer. | |
6 * | |
7 * MPlayer is free software; you can redistribute it and/or modify | |
8 * it under the terms of the GNU General Public License as published by | |
9 * the Free Software Foundation; either version 2 of the License, or | |
10 * (at your option) any later version. | |
11 * | |
12 * MPlayer is distributed in the hope that it will be useful, | |
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 * GNU General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU General Public License along | |
18 * with MPlayer; if not, write to the Free Software Foundation, Inc., | |
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | |
20 */ | |
20008
fa122b7c71c6
Add copyright notice and vim/emacs comments to libass and vf_ass.c.
eugeni
parents:
19563
diff
changeset
|
21 |
18937 | 22 #include "config.h" |
23 | |
24 #include <stdio.h> | |
25 #include <stdlib.h> | |
26 #include <string.h> | |
24545
9e5126679d44
Replace stdint.h #include by functionally equivalent inttypes.h.
diego
parents:
23134
diff
changeset
|
27 #include <inttypes.h> |
18937 | 28 #include <assert.h> |
29 | |
30 #include "config.h" | |
31 #include "mp_msg.h" | |
32 #include "help_mp.h" | |
31489 | 33 #include "mpcommon.h" |
18937 | 34 #include "img_format.h" |
35 #include "mp_image.h" | |
30653
3d23e24c5c60
Declare externally used variables from vd.c as extern in vd.h.
diego
parents:
30642
diff
changeset
|
36 #include "vd.h" |
18937 | 37 #include "vf.h" |
38 | |
39 #include "libvo/fastmemcpy.h" | |
35039
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
40 #include "libavutil/intreadwrite.h" |
32467 | 41 #include "sub/sub.h" |
18937 | 42 #include "m_option.h" |
43 #include "m_struct.h" | |
44 | |
32461 | 45 #include "sub/ass_mp.h" |
32460 | 46 #include "sub/eosd.h" |
18937 | 47 |
35575
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
48 #include "cpudetect.h" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
49 #include "libavutil/x86_cpu.h" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
50 |
/*
 * Channel extraction from a packed 32-bit colour: red is the top byte,
 * followed by green and blue, with alpha in the lowest byte.
 */
#define _r(c)  ((c) >> 24)
#define _g(c)  (((c) >> 16) & 0xFF)
#define _b(c)  (((c) >> 8) & 0xFF)
#define _a(c)  ((c) & 0xFF)

/*
 * Integer RGB -> YUV conversion with coefficients scaled by 1024
 * (hence the >> 10); luma is offset by 16 and chroma by 128.
 * Callers store the result in a uint8_t.
 */
#define rgba2y(c)  ( (( 263 * _r(c) + 516 * _g(c) + 100 * _b(c)) >> 10) + 16  )
#define rgba2u(c)  ( ((-152 * _r(c) - 298 * _g(c) + 450 * _b(c)) >> 10) + 128 )
#define rgba2v(c)  ( (( 450 * _r(c) - 376 * _g(c) -  73 * _b(c)) >> 10) + 128 )

/* map 0 - 0xFF -> 0 - 0x101 */
#define MAP_16BIT(v) RSHIFT(0x102 * (v), 8)
/* map 0 - 0xFF -> 0 - 0x10101 */
#define MAP_24BIT(v) RSHIFT(0x10203 * (v), 8)
18937 | 63 |
35575
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
#if HAVE_SSE4

/* Zero XMM register number n by XOR-ing it with itself. */
#define CLEAN_XMM(n) \
    __asm__ volatile ( "pxor %%xmm" #n ", %%xmm" #n " \n\t" : )

/*
 * Four identical 32-bit lanes each, used by the SSE4 renderers to apply
 * MAP_16BIT to four alpha values at once: multiply by 0x102, add the
 * rounding bias 0x80, then shift right by 8.
 * NOTE(review): the leading 16 is presumably the alignment in bytes
 * required for use as an SSE memory operand -- confirm against
 * DECLARE_ASM_CONST.
 */
DECLARE_ASM_CONST(16, uint32_t, SSE_32BIT_80H[4]) = { 0x80, 0x80, 0x80, 0x80 };
DECLARE_ASM_CONST(16, uint32_t, SSE_32BIT_MAP[4]) = { 0x102, 0x102, 0x102, 0x102 };

#endif // HAVE_SSE4
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
72 |
24969
c2b7ba444ade
begin moving const filter data to .text/.rodata sections
rfelker
parents:
24545
diff
changeset
|
73 static const struct vf_priv_s { |
32096 | 74 int outh, outw; |
18937 | 75 |
35039
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
76 int is_planar; |
32096 | 77 unsigned int outfmt; |
18937 | 78 |
32096 | 79 // 1 = auto-added filter: insert only if chain does not support EOSD already |
80 // 0 = insert always | |
81 int auto_insert; | |
18937 | 82 |
35244 | 83 // planar data to be directly rendered on frames |
84 uint8_t *planes[MP_MAX_PLANES]; | |
85 // alpha here is actually transparency, not opacity | |
86 uint8_t *alphas[MP_MAX_PLANES]; | |
87 struct dirty_rows_extent { | |
88 int xmin, xmax; | |
89 } *dirty_rows; | |
35039
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
90 |
35244 | 91 // called for every eosd image when subtitle is changed |
92 void (*draw_image)(vf_instance_t *, struct mp_eosd_image *); | |
93 // called for every time subtitle is changed | |
94 void (*prepare_buffer)(vf_instance_t *); | |
95 // called for every frame | |
96 void (*render_frame)(vf_instance_t *); | |
24969
c2b7ba444ade
begin moving const filter data to .text/.rodata sections
rfelker
parents:
24545
diff
changeset
|
97 } vf_priv_dflt; |
18937 | 98 |
35244 | 99 static void draw_image_yuv(vf_instance_t *vf, struct mp_eosd_image *img) |
100 { | |
101 uint32_t color = img->color; | |
102 uint32_t opacity = 0xFF - _a(color); | |
103 uint8_t y = rgba2y(color), | |
104 u = rgba2u(color), | |
105 v = rgba2v(color); | |
35268 | 106 int outw = vf->priv->outw; |
35244 | 107 uint8_t *alpha = vf->priv->alphas[0], |
108 *dst_y = vf->priv->planes[0], | |
109 *dst_u = vf->priv->planes[1], | |
110 *dst_v = vf->priv->planes[2]; | |
111 struct dirty_rows_extent *dirty_rows = vf->priv->dirty_rows; | |
112 int src_x = img->dst_x, src_w = img->w, | |
113 src_y = img->dst_y, src_h = img->h, | |
114 stride = img->stride; | |
115 uint8_t *src = img->bitmap; | |
116 int i, j; | |
117 | |
118 opacity = MAP_24BIT(opacity); | |
119 for (i = 0; i < src_h; i++) { | |
120 struct dirty_rows_extent *dirty_row = &dirty_rows[src_y + i]; | |
121 dirty_row->xmin = FFMIN(dirty_row->xmin, src_x); | |
122 dirty_row->xmax = FFMAX(dirty_row->xmax, src_x + src_w); | |
123 | |
124 for (j = 0; j < src_w; j++) { | |
125 uint32_t k = src[i * stride + j]; | |
126 if (k) { | |
127 size_t p = (src_y + i) * outw + src_x + j; | |
128 k *= opacity; | |
129 alpha[p] = RSHIFT((0xFFFFFF - k) * alpha[p], 24); | |
130 dst_y[p] = RSHIFT((0xFFFFFF - k) * dst_y[p] + k * y, 24); | |
131 dst_u[p] = RSHIFT((0xFFFFFF - k) * dst_u[p] + k * u, 24); | |
132 dst_v[p] = RSHIFT((0xFFFFFF - k) * dst_v[p] + k * v, 24); | |
133 } | |
134 } | |
135 } | |
136 } | |
137 | |
138 static void prepare_buffer_422(vf_instance_t *vf) | |
139 { | |
140 uint8_t *dst_u = vf->priv->planes[1], | |
141 *dst_v = vf->priv->planes[2]; | |
142 int outw = vf->priv->outw, | |
143 outh = vf->priv->outh; | |
144 struct dirty_rows_extent *dirty_rows = vf->priv->dirty_rows; | |
145 int i, j; | |
146 | |
147 for (i = 0; i < outh; i++) { | |
148 int xmin = dirty_rows[i].xmin & ~1, | |
149 xmax = dirty_rows[i].xmax; | |
150 for (j = xmin; j < xmax; j += 2) { | |
151 size_t p = i * outw + j; | |
152 dst_u[p] = (dst_u[p] + dst_u[p + 1]) / 2; | |
153 dst_v[p] = (dst_v[p] + dst_v[p + 1]) / 2; | |
35575
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
154 dst_u[p + 1] = dst_v[p + 1] = 0; |
35244 | 155 } |
156 } | |
157 } | |
158 | |
159 static void render_frame_yuv422(vf_instance_t *vf) | |
160 { | |
161 uint8_t *alpha = vf->priv->alphas[0]; | |
162 uint8_t *src_y = vf->priv->planes[0], | |
163 *src_u = vf->priv->planes[1], | |
164 *src_v = vf->priv->planes[2]; | |
165 int outw = vf->priv->outw, | |
166 outh = vf->priv->outh; | |
167 struct dirty_rows_extent *dirty_rows = vf->priv->dirty_rows; | |
168 uint8_t *dest = vf->dmpi->planes[0]; | |
169 int stride = vf->dmpi->stride[0]; | |
170 int is_uyvy = vf->priv->outfmt == IMGFMT_UYVY; | |
171 int i, j; | |
172 | |
173 for (i = 0; i < outh; i++) { | |
174 int xmin = dirty_rows[i].xmin & ~1, | |
175 xmax = dirty_rows[i].xmax; | |
176 for (j = xmin; j < xmax; j += 2) { | |
177 size_t src = i * outw + j, | |
178 dst = i * stride + j * 2; | |
179 uint_fast16_t a0 = alpha[src], | |
180 a1 = alpha[src + 1]; | |
181 uint8_t y0, y1, u, v; | |
182 | |
183 if (a0 == 0xFF && a1 == 0xFF) | |
184 continue; | |
185 | |
186 y0 = dest[dst + is_uyvy + 0]; | |
187 y1 = dest[dst + is_uyvy + 2]; | |
188 u = dest[dst - is_uyvy + 1]; | |
189 v = dest[dst - is_uyvy + 3]; | |
190 | |
191 a0 = MAP_16BIT(a0); | |
192 a1 = MAP_16BIT(a1); | |
193 y0 = ((a0 * y0) >> 8) + src_y[src]; | |
194 y1 = ((a1 * y1) >> 8) + src_y[src + 1]; | |
195 | |
196 a0 = (a0 + a1) / 2; | |
197 u = ((a0 * u) >> 8) + src_u[src]; | |
198 v = ((a0 * v) >> 8) + src_v[src]; | |
199 | |
200 dest[dst + is_uyvy + 0] = y0; | |
201 dest[dst + is_uyvy + 2] = y1; | |
202 dest[dst - is_uyvy + 1] = u; | |
203 dest[dst - is_uyvy + 3] = v; | |
204 } | |
205 } | |
206 } | |
207 | |
35575
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
208 #if HAVE_SSE4 |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
209 |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
210 static void render_frame_yuv422_sse4(vf_instance_t *vf) |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
211 { |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
212 uint8_t *alpha = vf->priv->alphas[0]; |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
213 uint8_t *src_y = vf->priv->planes[0], |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
214 *src_u = vf->priv->planes[1], |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
215 *src_v = vf->priv->planes[2]; |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
216 int outw = vf->priv->outw, |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
217 outh = vf->priv->outh; |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
218 struct dirty_rows_extent *dr = vf->priv->dirty_rows; |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
219 uint8_t *dst = vf->dmpi->planes[0]; |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
220 int stride = vf->dmpi->stride[0]; |
35579
6b169870ae30
Reduce register usage to fix the compilation in x86.
upsuper
parents:
35576
diff
changeset
|
221 int32_t is_uyvy = vf->priv->outfmt == IMGFMT_UYVY; |
35575
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
222 int i; |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
223 |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
224 CLEAN_XMM(7); |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
225 |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
226 for (i = 0; i < outh; i++) { |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
227 size_t xmin = dr[i].xmin & ~7, |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
228 xmax = dr[i].xmax; |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
229 __asm__ volatile ( |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
230 "jmp 4f \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
231 "1: \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
232 |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
233 "cmpl $-1, 0(%[alpha], %[j], 1) \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
234 "jne 2f \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
235 "cmpl $-1, 4(%[alpha], %[j], 1) \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
236 "jne 2f \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
237 "jmp 3f \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
238 |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
239 "2: \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
240 "movq (%[alpha], %[j], 1), %%xmm0 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
241 "punpcklbw %%xmm7, %%xmm0 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
242 "movdqa %%xmm0, %%xmm1 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
243 "punpcklwd %%xmm7, %%xmm0 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
244 "punpckhwd %%xmm7, %%xmm1 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
245 "pmulld "MANGLE(SSE_32BIT_MAP)", %%xmm0 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
246 "pmulld "MANGLE(SSE_32BIT_MAP)", %%xmm1 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
247 "paddd "MANGLE(SSE_32BIT_80H)", %%xmm0 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
248 "paddd "MANGLE(SSE_32BIT_80H)", %%xmm1 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
249 "psrld $8, %%xmm0 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
250 "psrld $8, %%xmm1 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
251 "movdqa %%xmm0, %%xmm2 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
252 "movdqa %%xmm1, %%xmm3 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
253 "packssdw %%xmm1, %%xmm0 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
254 "phaddd %%xmm3, %%xmm2 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
255 "psrld $1, %%xmm2 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
256 "packusdw %%xmm7, %%xmm2 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
257 "punpcklwd %%xmm2, %%xmm2 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
258 |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
259 "movdqu (%[dst], %[j], 2), %%xmm1 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
260 "movdqa %%xmm1, %%xmm3 \n\t" |
35579
6b169870ae30
Reduce register usage to fix the compilation in x86.
upsuper
parents:
35576
diff
changeset
|
261 "cmpl $0, %[f] \n\t" |
6b169870ae30
Reduce register usage to fix the compilation in x86.
upsuper
parents:
35576
diff
changeset
|
262 "je 11f \n\t" |
35575
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
263 "psrlw $8, %%xmm1 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
264 "psllw $8, %%xmm3 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
265 "psrlw $8, %%xmm3 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
266 "jmp 12f \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
267 "11: \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
268 "psllw $8, %%xmm1 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
269 "psrlw $8, %%xmm1 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
270 "psrlw $8, %%xmm3 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
271 "12: \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
272 "pmullw %%xmm0, %%xmm1 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
273 "pmullw %%xmm2, %%xmm3 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
274 "psrlw $8, %%xmm1 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
275 "psrlw $8, %%xmm3 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
276 "packuswb %%xmm7, %%xmm1 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
277 "packuswb %%xmm7, %%xmm3 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
278 "movq (%[src_y], %[j], 1), %%xmm4 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
279 "movq (%[src_u], %[j], 1), %%xmm5 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
280 "movq (%[src_v], %[j], 1), %%xmm6 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
281 "packuswb %%xmm7, %%xmm5 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
282 "packuswb %%xmm7, %%xmm6 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
283 "punpcklbw %%xmm6, %%xmm5 \n\t" |
35579
6b169870ae30
Reduce register usage to fix the compilation in x86.
upsuper
parents:
35576
diff
changeset
|
284 "cmpl $0, %[f] \n\t" |
6b169870ae30
Reduce register usage to fix the compilation in x86.
upsuper
parents:
35576
diff
changeset
|
285 "je 21f \n\t" |
35575
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
286 "punpcklbw %%xmm1, %%xmm3 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
287 "punpcklbw %%xmm4, %%xmm5 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
288 "paddb %%xmm5, %%xmm3 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
289 "movdqu %%xmm3, (%[dst], %[j], 2) \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
290 "jmp 22f \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
291 "21: \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
292 "punpcklbw %%xmm3, %%xmm1 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
293 "punpcklbw %%xmm5, %%xmm4 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
294 "paddb %%xmm4, %%xmm1 \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
295 "movdqu %%xmm1, (%[dst], %[j], 2) \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
296 "22: \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
297 |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
298 "3: \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
299 "add $8, %[j] \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
300 "4: \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
301 "cmp %[xmax], %[j] \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
302 "jl 1b \n\t" |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
303 |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
304 : : [dst] "r" (dst + i * stride), |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
305 [alpha] "r" (alpha + i * outw), |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
306 [src_y] "r" (src_y + i * outw), |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
307 [src_u] "r" (src_u + i * outw), |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
308 [src_v] "r" (src_v + i * outw), |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
309 [j] "r" (xmin), |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
310 [xmax] "g" (xmax), |
35579
6b169870ae30
Reduce register usage to fix the compilation in x86.
upsuper
parents:
35576
diff
changeset
|
311 [f] "g" (is_uyvy) |
35575
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
312 ); |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
313 } |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
314 } |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
315 |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
316 #endif // HAVE_SSE4 |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
317 |
35244 | 318 static void prepare_buffer_420p(vf_instance_t *vf) |
319 { | |
320 int outw = vf->priv->outw, | |
321 outh = vf->priv->outh; | |
322 uint8_t *dst_u = vf->priv->planes[1], | |
323 *dst_v = vf->priv->planes[2]; | |
324 uint8_t *src_a = vf->priv->alphas[0], | |
325 *dst_a = vf->priv->alphas[1]; | |
326 struct dirty_rows_extent *dirty_rows = vf->priv->dirty_rows; | |
327 int i, j; | |
328 | |
329 for (i = 0; i < outh; i += 2) { | |
330 int xmin = FFMIN(dirty_rows[i].xmin, dirty_rows[i + 1].xmin) & ~1, | |
331 xmax = FFMAX(dirty_rows[i].xmax, dirty_rows[i + 1].xmax); | |
332 for (j = xmin; j < xmax; j += 2) { | |
333 size_t p = i * outw / 4 + j / 2, | |
334 q1 = i * outw + j, | |
335 q2 = q1 + outw; | |
336 dst_a[p] = (src_a[q1] + src_a[q1 + 1] + | |
337 src_a[q2] + src_a[q2 + 1] + 2) / 4; | |
338 dst_u[p] = (dst_u[q1] + dst_u[q1 + 1] + | |
339 dst_u[q2] + dst_u[q2 + 1] + 2) / 4; | |
340 dst_v[p] = (dst_v[q1] + dst_v[q1 + 1] + | |
341 dst_v[q2] + dst_v[q2 + 1] + 2) / 4; | |
342 } | |
343 } | |
35576
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
344 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
345 #if HAVE_SSE4 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
346 // for render_frame_yuv420p_sse4 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
347 if (gCpuCaps.hasSSE4 && outw % 32 == 0) { |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
348 for (i = 0; i < outh; i += 2) { |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
349 int xmin = FFMIN(dirty_rows[i].xmin, dirty_rows[i + 1].xmin) & ~1, |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
350 xmax = FFMAX(dirty_rows[i].xmax, dirty_rows[i + 1].xmax); |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
351 if (xmin >= xmax) |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
352 continue; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
353 for (j = xmin & ~31; j < xmin; j += 2) { |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
354 size_t p = i * outw / 4 + j / 2; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
355 dst_a[p] = 0xFF; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
356 dst_u[p] = dst_v[p] = 0; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
357 } |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
358 for (j = xmax; j < FFALIGN(xmax, 32); j += 2) { |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
359 size_t p = i * outw / 4 + j / 2; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
360 dst_a[p] = 0xFF; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
361 dst_u[p] = dst_v[p] = 0; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
362 } |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
363 } |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
364 } |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
365 #endif // HAVE_SSE4 |
35244 | 366 } |
367 | |
368 static void render_frame_yuv420p(vf_instance_t *vf) | |
369 { | |
370 uint8_t **planes = vf->priv->planes; | |
371 uint8_t **dest = vf->dmpi->planes; | |
372 struct dirty_rows_extent *dirty_rows = vf->priv->dirty_rows; | |
373 uint8_t *alpha; | |
35268 | 374 uint8_t *src_y = planes[0], |
375 *src_u = planes[1], | |
376 *src_v = planes[2]; | |
377 uint8_t *dst_y = dest[0], | |
378 *dst_u = dest[1], | |
379 *dst_v = dest[2]; | |
35244 | 380 int stride; |
381 int outw = vf->priv->outw, | |
382 outh = vf->priv->outh; | |
383 int i, j; | |
384 | |
385 // y | |
386 alpha = vf->priv->alphas[0]; | |
387 stride = vf->dmpi->stride[0]; | |
388 for (i = 0; i < outh; i++) { | |
389 int xmin = dirty_rows[i].xmin, | |
390 xmax = dirty_rows[i].xmax; | |
391 for (j = xmin; j < xmax; j++) { | |
392 size_t s = i * outw + j, | |
393 d = i * stride + j; | |
394 if (alpha[s] != 0xFF) | |
395 dst_y[d] = ((MAP_16BIT(alpha[s]) * dst_y[d]) >> 8) + src_y[s]; | |
396 } | |
397 } | |
398 | |
399 // u & v | |
400 alpha = vf->priv->alphas[1]; | |
401 stride = vf->dmpi->stride[1]; | |
402 for (i = 0; i < outh / 2; i++) { | |
403 int xmin = FFMIN(dirty_rows[i * 2].xmin, dirty_rows[i * 2 + 1].xmin), | |
404 xmax = FFMAX(dirty_rows[i * 2].xmax, dirty_rows[i * 2 + 1].xmax); | |
405 for (j = xmin / 2; j < (xmax + 1) / 2; j++) { | |
406 size_t s = i * outw / 2 + j, | |
407 d = i * stride + j; | |
408 if (alpha[s] != 0xFF) { | |
409 uint_fast16_t a = MAP_16BIT(alpha[s]); | |
410 dst_u[d] = ((a * dst_u[d]) >> 8) + src_u[s]; | |
411 dst_v[d] = ((a * dst_v[d]) >> 8) + src_v[s]; | |
412 } | |
413 } | |
414 } | |
415 } | |
416 | |
35576
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
417 #if HAVE_SSE4 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
418 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
419 static void render_frame_yuv420p_sse4(vf_instance_t *vf) |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
420 { |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
421 struct dirty_rows_extent *dr = vf->priv->dirty_rows; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
422 uint8_t *alpha; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
423 uint8_t *src_y = vf->priv->planes[0], |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
424 *src_u = vf->priv->planes[1], |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
425 *src_v = vf->priv->planes[2]; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
426 uint8_t *dst_y = vf->dmpi->planes[0], |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
427 *dst_u = vf->dmpi->planes[1], |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
428 *dst_v = vf->dmpi->planes[2]; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
429 int stride; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
430 int outw = vf->priv->outw, |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
431 outh = vf->priv->outh; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
432 int i; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
433 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
434 CLEAN_XMM(7); |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
435 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
436 #define CHECK_16_ALPHA \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
437 "cmpl $-1, 0(%[alpha], %[j], 1) \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
438 "jne 2f \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
439 "cmpl $-1, 4(%[alpha], %[j], 1) \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
440 "jne 2f \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
441 "cmpl $-1, 8(%[alpha], %[j], 1) \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
442 "jne 2f \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
443 "cmpl $-1, 12(%[alpha], %[j], 1) \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
444 "jne 2f \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
445 "jmp 3f \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
446 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
447 #define MAP_16_ALPHA \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
448 "movq 0(%[alpha], %[j], 1), %%xmm0 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
449 "movq 8(%[alpha], %[j], 1), %%xmm2 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
450 "punpcklbw %%xmm7, %%xmm0 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
451 "punpcklbw %%xmm7, %%xmm2 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
452 "movdqa %%xmm0, %%xmm1 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
453 "movdqa %%xmm2, %%xmm3 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
454 "punpcklwd %%xmm7, %%xmm0 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
455 "punpckhwd %%xmm7, %%xmm1 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
456 "punpcklwd %%xmm7, %%xmm2 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
457 "punpckhwd %%xmm7, %%xmm3 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
458 "pmulld "MANGLE(SSE_32BIT_MAP)", %%xmm0 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
459 "pmulld "MANGLE(SSE_32BIT_MAP)", %%xmm1 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
460 "pmulld "MANGLE(SSE_32BIT_MAP)", %%xmm2 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
461 "pmulld "MANGLE(SSE_32BIT_MAP)", %%xmm3 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
462 "paddd "MANGLE(SSE_32BIT_80H)", %%xmm0 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
463 "paddd "MANGLE(SSE_32BIT_80H)", %%xmm1 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
464 "paddd "MANGLE(SSE_32BIT_80H)", %%xmm2 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
465 "paddd "MANGLE(SSE_32BIT_80H)", %%xmm3 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
466 "psrld $8, %%xmm0 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
467 "psrld $8, %%xmm1 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
468 "psrld $8, %%xmm2 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
469 "psrld $8, %%xmm3 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
470 "packssdw %%xmm1, %%xmm0 \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
471 "packssdw %%xmm3, %%xmm2 \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
472 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
473 #define MUL_ALPHA(dst, xmm1, xmm2) \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
474 "movq 0(%["#dst"], %[j], 1), %%"#xmm1" \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
475 "movq 8(%["#dst"], %[j], 1), %%"#xmm2" \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
476 "punpcklbw %%xmm7, %%"#xmm1" \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
477 "punpcklbw %%xmm7, %%"#xmm2" \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
478 "pmullw %%xmm0, %%"#xmm1" \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
479 "pmullw %%xmm2, %%"#xmm2" \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
480 "psrlw $8, %%"#xmm1" \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
481 "psrlw $8, %%"#xmm2" \n\t" \ |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
482 "packuswb %%"#xmm2", %%"#xmm1" \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
483 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
484 // y |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
485 alpha = vf->priv->alphas[0]; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
486 stride = vf->dmpi->stride[0]; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
487 for (i = 0; i < outh; i++) { |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
488 size_t xmin = dr[i].xmin & ~15, |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
489 xmax = dr[i].xmax; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
490 __asm__ volatile ( |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
491 "jmp 4f \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
492 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
493 "1: \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
494 CHECK_16_ALPHA |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
495 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
496 "2: \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
497 MAP_16_ALPHA |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
498 MUL_ALPHA(dst_y, xmm1, xmm3) |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
499 "movdqa (%[src_y], %[j], 1), %%xmm0 \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
500 "paddb %%xmm0, %%xmm1 \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
501 "movdqu %%xmm1, (%[dst_y], %[j], 1) \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
502 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
503 "3: \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
504 "add $16, %[j] \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
505 "4: \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
506 "cmp %[xmax], %[j] \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
507 "jl 1b \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
508 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
509 : : [j] "r" (xmin), |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
510 [xmax] "g" (xmax), |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
511 [alpha] "r" (alpha + i * outw), |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
512 [src_y] "r" (src_y + i * outw), |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
513 [dst_y] "r" (dst_y + i * stride) |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
514 ); |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
515 } |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
516 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
517 // u & v |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
518 alpha = vf->priv->alphas[1]; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
519 stride = vf->dmpi->stride[1]; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
520 for (i = 0; i < outh / 2; i++) { |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
521 size_t xmin = FFMIN(dr[i * 2].xmin, dr[i * 2 + 1].xmin) & ~31, |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
522 xmax = FFMAX(dr[i * 2].xmax, dr[i * 2 + 1].xmax); |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
523 __asm__ volatile ( |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
524 "jmp 4f \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
525 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
526 "1: \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
527 CHECK_16_ALPHA |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
528 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
529 "2: \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
530 MAP_16_ALPHA |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
531 MUL_ALPHA(dst_u, xmm1, xmm4) |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
532 MUL_ALPHA(dst_v, xmm3, xmm5) |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
533 "movdqa (%[src_u], %[j], 1), %%xmm0 \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
534 "movdqa (%[src_v], %[j], 1), %%xmm2 \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
535 "paddb %%xmm0, %%xmm1 \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
536 "paddb %%xmm2, %%xmm3 \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
537 "movdqu %%xmm1, (%[dst_u], %[j], 1) \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
538 "movdqu %%xmm3, (%[dst_v], %[j], 1) \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
539 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
540 "3: \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
541 "add $16, %[j] \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
542 "4: \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
543 "cmp %[xmax], %[j] \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
544 "jl 1b \n\t" |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
545 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
546 : : [j] "r" (xmin / 2), |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
547 [xmax] "g" ((xmax + 1) / 2), |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
548 [alpha] "r" (alpha + i * outw / 2), |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
549 [src_u] "r" (src_u + i * outw / 2), |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
550 [src_v] "r" (src_v + i * outw / 2), |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
551 [dst_u] "r" (dst_u + i * stride), |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
552 [dst_v] "r" (dst_v + i * stride) |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
553 ); |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
554 } |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
555 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
556 #undef CHECK_16_ALPHA |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
557 #undef MAP_16_ALPHA |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
558 #undef MUL_ALPHA |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
559 } |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
560 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
561 #endif // HAVE_SSE4 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
562 |
35244 | 563 static void clean_buffer(vf_instance_t *vf) |
564 { | |
565 int outw = vf->priv->outw, | |
566 outh = vf->priv->outh; | |
567 struct dirty_rows_extent *dirty_rows = vf->priv->dirty_rows; | |
568 uint8_t **planes = vf->priv->planes; | |
569 uint8_t *alpha = vf->priv->alphas[0]; | |
570 int i, j; | |
571 | |
572 for (i = 0; i < MP_MAX_PLANES; i++) { | |
573 uint8_t *plane = planes[i]; | |
574 if (!plane) | |
575 break; | |
576 for (j = 0; j < outh; j++) { | |
577 int xmin = dirty_rows[j].xmin; | |
578 int width = dirty_rows[j].xmax - xmin; | |
579 if (width > 0) | |
580 memset(plane + j * outw + xmin, 0, width); | |
581 } | |
582 } | |
583 for (i = 0; i < outh; i++) { | |
584 int xmin = dirty_rows[i].xmin; | |
585 int width = dirty_rows[i].xmax - xmin; | |
586 if (width > 0) | |
587 memset(alpha + i * outw + xmin, -1, width); | |
588 } | |
589 for (i = 0; i < outh; i++) { | |
590 dirty_rows[i].xmin = outw; | |
591 dirty_rows[i].xmax = 0; | |
592 } | |
593 } | |
18937 | 594 |
30642
a972c1a4a012
cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents:
30638
diff
changeset
|
595 static int config(struct vf_instance *vf, |
32096 | 596 int width, int height, int d_width, int d_height, |
597 unsigned int flags, unsigned int outfmt) | |
18937 | 598 { |
32391
b4c3659d16b1
Use a dynamic list for the sources of EOSD elements.
cigaes
parents:
32261
diff
changeset
|
599 struct mp_eosd_settings res = {0}; |
35244 | 600 struct dirty_rows_extent *dirty_rows; |
601 int outw, outh; | |
602 int planes, alphas; | |
603 int i; | |
31927 | 604 |
35574 | 605 vf->priv->outfmt = outfmt; |
606 vf->priv->outh = outh = height + ass_top_margin + ass_bottom_margin; | |
607 vf->priv->outw = outw = width; | |
608 | |
35039
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
609 switch (outfmt) { |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
610 case IMGFMT_YV12: |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
611 case IMGFMT_I420: |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
612 case IMGFMT_IYUV: |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
613 vf->priv->is_planar = 1; |
35244 | 614 planes = 3; |
615 alphas = 2; | |
616 vf->priv->draw_image = draw_image_yuv; | |
617 vf->priv->render_frame = render_frame_yuv420p; | |
618 vf->priv->prepare_buffer = prepare_buffer_420p; | |
35576
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
619 #if HAVE_SSE4 |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
620 if (gCpuCaps.hasSSE4 && outw % 32 == 0) |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
621 vf->priv->render_frame = render_frame_yuv420p_sse4; |
2508973357da
Accelerate ass rendering by using SSE4 for yuv420p.
upsuper
parents:
35575
diff
changeset
|
622 #endif |
35039
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
623 break; |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
624 case IMGFMT_UYVY: |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
625 case IMGFMT_YUY2: |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
626 vf->priv->is_planar = 0; |
35244 | 627 planes = 3; |
628 alphas = 1; | |
629 vf->priv->draw_image = draw_image_yuv; | |
630 vf->priv->render_frame = render_frame_yuv422; | |
631 vf->priv->prepare_buffer = prepare_buffer_422; | |
35575
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
632 #if HAVE_SSE4 |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
633 if (gCpuCaps.hasSSE4 && outw % 8 == 0) |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
634 vf->priv->render_frame = render_frame_yuv422_sse4; |
db12239148e9
Accelerate ass rendering by using SSE4 for yuv422.
upsuper
parents:
35574
diff
changeset
|
635 #endif |
35039
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
636 break; |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
637 default: |
32096 | 638 return 0; |
35039
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
639 } |
18937 | 640 |
32096 | 641 if (!opt_screen_size_x && !opt_screen_size_y) { |
642 d_width = d_width * vf->priv->outw / width; | |
643 d_height = d_height * vf->priv->outh / height; | |
644 } | |
18937 | 645 |
35244 | 646 for (i = 0; i < planes; i++) |
647 vf->priv->planes[i] = av_malloc(outw * outh); | |
648 for (i = 0; i < alphas; i++) | |
649 vf->priv->alphas[i] = av_malloc(outw * outh); | |
650 dirty_rows = av_malloc(outh * sizeof(*dirty_rows)); | |
651 // mark all rows dirty here | |
652 // so that they can be properly cleaned in clear_buffer() | |
653 for (i = 0; i < outh; i++) { | |
654 dirty_rows[i].xmin = 0; | |
655 dirty_rows[i].xmax = outw; | |
656 } | |
657 vf->priv->dirty_rows = dirty_rows; | |
658 clean_buffer(vf); | |
29263
0f1b5b68af32
whitespace cosmetics: Remove all trailing whitespace.
diego
parents:
26727
diff
changeset
|
659 |
32096 | 660 res.w = vf->priv->outw; |
661 res.h = vf->priv->outh; | |
662 res.srcw = width; | |
663 res.srch = height; | |
664 res.mt = ass_top_margin; | |
665 res.mb = ass_bottom_margin; | |
32391
b4c3659d16b1
Use a dynamic list for the sources of EOSD elements.
cigaes
parents:
32261
diff
changeset
|
666 eosd_configure(&res); |
18937 | 667 |
32096 | 668 return vf_next_config(vf, vf->priv->outw, vf->priv->outh, d_width, |
669 d_height, flags, outfmt); | |
18937 | 670 } |
671 | |
30642
a972c1a4a012
cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents:
30638
diff
changeset
|
672 static void get_image(struct vf_instance *vf, mp_image_t *mpi) |
18937 | 673 { |
32096 | 674 if (mpi->type == MP_IMGTYPE_IPB) |
675 return; | |
676 if (mpi->flags & MP_IMGFLAG_PRESERVE) | |
677 return; | |
678 if (mpi->imgfmt != vf->priv->outfmt) | |
679 return; // colorspace differ | |
29263
0f1b5b68af32
whitespace cosmetics: Remove all trailing whitespace.
diego
parents:
26727
diff
changeset
|
680 |
32096 | 681 // width never changes, always try full DR |
682 mpi->priv = vf->dmpi = vf_get_image(vf->next, mpi->imgfmt, mpi->type, | |
683 mpi->flags | MP_IMGFLAG_READABLE, | |
34882
649d4cad4619
Request a sufficiently large image for direct rendering.
reimar
parents:
34863
diff
changeset
|
684 FFMAX(mpi->width, vf->priv->outw), |
649d4cad4619
Request a sufficiently large image for direct rendering.
reimar
parents:
34863
diff
changeset
|
685 FFMAX(mpi->height, vf->priv->outh)); |
18937 | 686 |
32096 | 687 if ( (vf->dmpi->flags & MP_IMGFLAG_DRAW_CALLBACK) && |
688 !(vf->dmpi->flags & MP_IMGFLAG_DIRECT)) { | |
689 mp_msg(MSGT_ASS, MSGL_INFO, MSGTR_MPCODECS_FullDRNotPossible); | |
690 return; | |
691 } | |
692 // set up mpi as a cropped-down image of dmpi: | |
693 if (mpi->flags & MP_IMGFLAG_PLANAR) { | |
694 mpi->planes[0] = vf->dmpi->planes[0] + ass_top_margin * vf->dmpi->stride[0]; | |
695 mpi->planes[1] = vf->dmpi->planes[1] + (ass_top_margin >> mpi->chroma_y_shift) * vf->dmpi->stride[1]; | |
696 mpi->planes[2] = vf->dmpi->planes[2] + (ass_top_margin >> mpi->chroma_y_shift) * vf->dmpi->stride[2]; | |
697 mpi->stride[1] = vf->dmpi->stride[1]; | |
698 mpi->stride[2] = vf->dmpi->stride[2]; | |
699 } else { | |
700 mpi->planes[0] = vf->dmpi->planes[0] + ass_top_margin * vf->dmpi->stride[0]; | |
701 } | |
702 mpi->stride[0] = vf->dmpi->stride[0]; | |
703 mpi->width = vf->dmpi->width; | |
704 mpi->flags |= MP_IMGFLAG_DIRECT; | |
705 mpi->flags &= ~MP_IMGFLAG_DRAW_CALLBACK; | |
706 // vf->dmpi->flags &= ~MP_IMGFLAG_DRAW_CALLBACK; | |
18937 | 707 } |
708 | |
709 static void blank(mp_image_t *mpi, int y1, int y2) | |
710 { | |
32096 | 711 int color[3] = { 16, 128, 128 }; // black (YUV) |
712 int y; | |
713 unsigned char *dst; | |
714 int chroma_rows = (y2 - y1) >> mpi->chroma_y_shift; | |
18937 | 715 |
35039
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
716 if (mpi->flags & MP_IMGFLAG_PLANAR) { |
35436 | 717 dst = mpi->planes[0] + y1 * mpi->stride[0]; |
718 for (y = 0; y < y2 - y1; ++y) { | |
719 memset(dst, color[0], mpi->w); | |
720 dst += mpi->stride[0]; | |
721 } | |
722 dst = mpi->planes[1] + (y1 >> mpi->chroma_y_shift) * mpi->stride[1]; | |
723 for (y = 0; y < chroma_rows; ++y) { | |
724 memset(dst, color[1], mpi->chroma_width); | |
725 dst += mpi->stride[1]; | |
726 } | |
727 dst = mpi->planes[2] + (y1 >> mpi->chroma_y_shift) * mpi->stride[2]; | |
728 for (y = 0; y < chroma_rows; ++y) { | |
729 memset(dst, color[2], mpi->chroma_width); | |
730 dst += mpi->stride[2]; | |
731 } | |
35039
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
732 } else { |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
733 unsigned char packed_color[4]; |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
734 int x; |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
735 if (mpi->imgfmt == IMGFMT_UYVY) { |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
736 packed_color[0] = color[1]; |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
737 packed_color[1] = color[0]; |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
738 packed_color[2] = color[2]; |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
739 packed_color[3] = color[0]; |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
740 } else { |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
741 packed_color[0] = color[0]; |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
742 packed_color[1] = color[1]; |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
743 packed_color[2] = color[0]; |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
744 packed_color[3] = color[2]; |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
745 } |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
746 dst = mpi->planes[0] + y1 * mpi->stride[0]; |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
747 for (y = y1; y < y2; ++y) { |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
748 for (x = 0; x < mpi->w / 2; ++x) |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
749 AV_COPY32(dst + 4 * x, packed_color); |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
750 dst += mpi->stride[0]; |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
751 } |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
752 } |
18937 | 753 } |
754 | |
30642
a972c1a4a012
cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents:
30638
diff
changeset
|
755 static int prepare_image(struct vf_instance *vf, mp_image_t *mpi) |
18937 | 756 { |
32096 | 757 if (mpi->flags & MP_IMGFLAG_DIRECT || |
758 mpi->flags & MP_IMGFLAG_DRAW_CALLBACK) { | |
759 vf->dmpi = mpi->priv; | |
760 if (!vf->dmpi) { | |
761 mp_msg(MSGT_ASS, MSGL_WARN, MSGTR_MPCODECS_FunWhydowegetNULL); | |
762 return 0; | |
763 } | |
764 mpi->priv = NULL; | |
765 // we've used DR, so we're ready... | |
766 if (ass_top_margin) | |
767 blank(vf->dmpi, 0, ass_top_margin); | |
768 if (ass_bottom_margin) | |
769 blank(vf->dmpi, vf->priv->outh - ass_bottom_margin, vf->priv->outh); | |
770 if (!(mpi->flags & MP_IMGFLAG_PLANAR)) | |
771 vf->dmpi->planes[1] = mpi->planes[1]; // passthrough rgb8 palette | |
772 return 0; | |
773 } | |
774 // hope we'll get DR buffer: | |
775 vf->dmpi = vf_get_image(vf->next, vf->priv->outfmt, MP_IMGTYPE_TEMP, | |
776 MP_IMGFLAG_ACCEPT_STRIDE | MP_IMGFLAG_READABLE, | |
777 vf->priv->outw, vf->priv->outh); | |
18937 | 778 |
32096 | 779 // copy mpi->dmpi... |
780 if (mpi->flags & MP_IMGFLAG_PLANAR) { | |
781 memcpy_pic(vf->dmpi->planes[0] + ass_top_margin * vf->dmpi->stride[0], | |
35436 | 782 mpi->planes[0], |
783 mpi->w, | |
784 mpi->h, | |
785 vf->dmpi->stride[0], | |
786 mpi->stride[0]); | |
32096 | 787 memcpy_pic(vf->dmpi->planes[1] + (ass_top_margin >> mpi->chroma_y_shift) * vf->dmpi->stride[1], |
35436 | 788 mpi->planes[1], |
789 mpi->w >> mpi->chroma_x_shift, | |
32096 | 790 mpi->h >> mpi->chroma_y_shift, |
35436 | 791 vf->dmpi->stride[1], |
32096 | 792 mpi->stride[1]); |
793 memcpy_pic(vf->dmpi->planes[2] + (ass_top_margin >> mpi->chroma_y_shift) * vf->dmpi->stride[2], | |
35436 | 794 mpi->planes[2], |
32096 | 795 mpi->w >> mpi->chroma_x_shift, |
796 mpi->h >> mpi->chroma_y_shift, | |
35436 | 797 vf->dmpi->stride[2], |
32096 | 798 mpi->stride[2]); |
799 } else { | |
800 memcpy_pic(vf->dmpi->planes[0] + ass_top_margin * vf->dmpi->stride[0], | |
35436 | 801 mpi->planes[0], |
32096 | 802 mpi->w * (vf->dmpi->bpp / 8), |
35436 | 803 mpi->h, |
32096 | 804 vf->dmpi->stride[0], |
35436 | 805 mpi->stride[0]); |
32096 | 806 vf->dmpi->planes[1] = mpi->planes[1]; // passthrough rgb8 palette |
807 } | |
808 if (ass_top_margin) | |
809 blank(vf->dmpi, 0, ass_top_margin); | |
810 if (ass_bottom_margin) | |
811 blank(vf->dmpi, vf->priv->outh - ass_bottom_margin, vf->priv->outh); | |
812 return 0; | |
18937 | 813 } |
814 | |
35244 | 815 static void prepare_eosd(vf_instance_t *vf, struct mp_eosd_image_list *imgs) |
18937 | 816 { |
35244 | 817 struct mp_eosd_image *img = eosd_image_first(imgs); |
818 void (*draw_image)(vf_instance_t *, struct mp_eosd_image *); | |
35039
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
819 |
35244 | 820 clean_buffer(vf); |
821 draw_image = vf->priv->draw_image; | |
822 for (; img; img = eosd_image_next(imgs)) | |
823 draw_image(vf, img); | |
824 vf->priv->prepare_buffer(vf); | |
18937 | 825 } |
826 | |
30642
a972c1a4a012
cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents:
30638
diff
changeset
|
827 static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts) |
18937 | 828 { |
32391
b4c3659d16b1
Use a dynamic list for the sources of EOSD elements.
cigaes
parents:
32261
diff
changeset
|
829 struct mp_eosd_image_list images; |
b4c3659d16b1
Use a dynamic list for the sources of EOSD elements.
cigaes
parents:
32261
diff
changeset
|
830 eosd_render_frame(pts, &images); |
32096 | 831 prepare_image(vf, mpi); |
35244 | 832 if (images.changed) |
833 prepare_eosd(vf, &images); | |
834 vf->priv->render_frame(vf); | |
32096 | 835 return vf_next_put_image(vf, vf->dmpi, pts); |
18937 | 836 } |
837 | |
30642
a972c1a4a012
cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents:
30638
diff
changeset
|
838 static int query_format(struct vf_instance *vf, unsigned int fmt) |
18937 | 839 { |
32096 | 840 switch (fmt) { |
841 case IMGFMT_YV12: | |
842 case IMGFMT_I420: | |
843 case IMGFMT_IYUV: | |
35039
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
844 case IMGFMT_UYVY: |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
845 case IMGFMT_YUY2: |
35033 | 846 return vf_next_query_format(vf, fmt) | VFCAP_EOSD; |
32096 | 847 } |
848 return 0; | |
18937 | 849 } |
850 | |
851 static int control(vf_instance_t *vf, int request, void *data) | |
852 { | |
32096 | 853 switch (request) { |
854 case VFCTRL_INIT_EOSD: | |
855 return CONTROL_TRUE; | |
856 case VFCTRL_DRAW_EOSD: | |
857 return CONTROL_TRUE; | |
858 } | |
859 return vf_next_control(vf, request, data); | |
18937 | 860 } |
861 | |
30642
a972c1a4a012
cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents:
30638
diff
changeset
|
862 static void uninit(struct vf_instance *vf) |
18937 | 863 { |
35244 | 864 int i; |
865 for (i = 0; i < MP_MAX_PLANES; i++) | |
866 av_free(vf->priv->planes[i]); | |
867 for (i = 0; i < MP_MAX_PLANES; i++) | |
868 av_free(vf->priv->alphas[i]); | |
869 av_free(vf->priv->dirty_rows); | |
18937 | 870 } |
871 | |
32096 | 872 static const unsigned int fmt_list[] = { |
873 IMGFMT_YV12, | |
874 IMGFMT_I420, | |
875 IMGFMT_IYUV, | |
35039
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
876 IMGFMT_UYVY, |
848835e1b053
vf_ass: add support for rendering on YUY2 and UYVY images.
reimar
parents:
35033
diff
changeset
|
877 IMGFMT_YUY2, |
32096 | 878 0 |
18937 | 879 }; |
880 | |
30638
a7b908875c14
Rename open() vf initialization function to vf_open().
diego
parents:
30633
diff
changeset
|
881 static int vf_open(vf_instance_t *vf, char *args) |
18937 | 882 { |
32096 | 883 int flags; |
35033 | 884 unsigned outfmt = vf_match_csp(&vf->next, fmt_list, IMGFMT_YV12); |
885 if (outfmt) | |
886 flags = vf_next_query_format(vf, outfmt); | |
887 if (!outfmt || (vf->priv->auto_insert && flags & VFCAP_EOSD)) { | |
32096 | 888 uninit(vf); |
889 return 0; | |
890 } | |
29263
0f1b5b68af32
whitespace cosmetics: Remove all trailing whitespace.
diego
parents:
26727
diff
changeset
|
891 |
32096 | 892 if (vf->priv->auto_insert) |
893 mp_msg(MSGT_ASS, MSGL_INFO, "[ass] auto-open\n"); | |
29263
0f1b5b68af32
whitespace cosmetics: Remove all trailing whitespace.
diego
parents:
26727
diff
changeset
|
894 |
32096 | 895 vf->config = config; |
896 vf->query_format = query_format; | |
897 vf->uninit = uninit; | |
898 vf->control = control; | |
899 vf->get_image = get_image; | |
900 vf->put_image = put_image; | |
901 vf->default_caps = VFCAP_EOSD; | |
902 return 1; | |
18937 | 903 } |
904 | |
905 #define ST_OFF(f) M_ST_OFF(struct vf_priv_s,f) | |
24969
c2b7ba444ade
begin moving const filter data to .text/.rodata sections
rfelker
parents:
24545
diff
changeset
|
906 static const m_option_t vf_opts_fields[] = { |
32096 | 907 {"auto", ST_OFF(auto_insert), CONF_TYPE_FLAG, 0, 0, 1, NULL}, |
908 {NULL, NULL, 0, 0, 0, 0, NULL} | |
18937 | 909 }; |
910 | |
24969
c2b7ba444ade
begin moving const filter data to .text/.rodata sections
rfelker
parents:
24545
diff
changeset
|
911 static const m_struct_t vf_opts = { |
32096 | 912 "ass", |
913 sizeof(struct vf_priv_s), | |
914 &vf_priv_dflt, | |
915 vf_opts_fields | |
18937 | 916 }; |
917 | |
24969
c2b7ba444ade
begin moving const filter data to .text/.rodata sections
rfelker
parents:
24545
diff
changeset
|
918 const vf_info_t vf_info_ass = { |
32096 | 919 "Render ASS/SSA subtitles", |
920 "ass", | |
35244 | 921 "Evgeniy Stepanov, Xidorn Quan", |
32096 | 922 "", |
923 vf_open, | |
924 &vf_opts | |
18937 | 925 }; |