Mercurial > libavcodec.hg
annotate x86/h264dsp_mmx.c @ 12454:f4355cd85faa libavcodec
Port latest x264 deblock asm (before they moved to using NV12 as internal
format), LGPL'ed with permission from Jason and Loren. This includes mmx2
code, so remove inline asm from h264dsp_mmx.c accordingly.
author | rbultje |
---|---|
date | Fri, 03 Sep 2010 16:52:46 +0000 |
parents | 4c3e6ff1237e |
children | a5ddb39627fd |
rev | line source |
---|---|
8430 | 1 /* |
2 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt | |
3 * | |
4 * This file is part of FFmpeg. | |
5 * | |
6 * FFmpeg is free software; you can redistribute it and/or | |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2.1 of the License, or (at your option) any later version. | |
10 * | |
11 * FFmpeg is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with FFmpeg; if not, write to the Free Software | |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 */ | |
20 | |
12450
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
21 #include "libavutil/x86_cpu.h" |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
22 #include "libavcodec/h264dsp.h" |
8430 | 23 #include "dsputil_mmx.h" |
24 | |
11369 | 25 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; |
26 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; | |
8430 | 27 |
28 /***********************************/ | |
29 /* IDCT */ | |
30 | |
/* Double butterfly on packed 16-bit words:
 *   a = a + b, b = b - a(old)   and   c = c + d, d = d - c(old).
 * Each difference is formed as 2*x - sum, so no scratch register is needed. */
31 #define SUMSUB_BADC( a, b, c, d ) \ | |
32 "paddw "#b", "#a" \n\t"\ | |
33 "paddw "#d", "#c" \n\t"\ | |
34 "paddw "#b", "#b" \n\t"\ | |
35 "paddw "#d", "#d" \n\t"\ | |
36 "psubw "#a", "#b" \n\t"\ | |
37 "psubw "#c", "#d" \n\t" | |
38 | |
/* Half-scaled butterfly on packed 16-bit words:
 *   b = a + (b >> 1)   and   a = (a >> 1) - b(old).
 * t is a scratch register; on exit it holds the original value of b.
 * The shifts are arithmetic (psraw). */
39 #define SUMSUBD2_AB( a, b, t ) \ | |
40 "movq "#b", "#t" \n\t"\ | |
41 "psraw $1 , "#b" \n\t"\ | |
42 "paddw "#a", "#b" \n\t"\ | |
43 "psraw $1 , "#a" \n\t"\ | |
44 "psubw "#t", "#a" \n\t" | |
45 | |
/* One 1-D pass of the 4-point inverse transform, composed of SUMSUB_BA
 * (not defined in this part of the file), SUMSUBD2_AB and SUMSUB_BADC.
 * t is a scratch register. Which register ends up holding which result is
 * noted in the comments at each place the macro is used. */
46 #define IDCT4_1D( s02, s13, d02, d13, t ) \ | |
47 SUMSUB_BA ( s02, d02 )\ | |
48 SUMSUBD2_AB( s13, d13, t )\ | |
49 SUMSUB_BADC( d13, s02, s13, d02 ) | |
50 | |
/* Add one row of four residuals to four destination pixels.
 * p: register with four 16-bit residuals; it is shifted right by 6 first.
 * t: scratch register.
 * z: register that must contain zero (it is the high half when the pixels
 *    are widened to words, and the upper input of the final pack).
 * The 4 pixel bytes are read from and written back to the address held in
 * operand %0 of the enclosing asm statement. The addition saturates as signed
 * words and the pack saturates to unsigned bytes. */
51 #define STORE_DIFF_4P( p, t, z ) \ | |
52 "psraw $6, "#p" \n\t"\ | |
53 "movd (%0), "#t" \n\t"\ | |
54 "punpcklbw "#z", "#t" \n\t"\ | |
55 "paddsw "#t", "#p" \n\t"\ | |
56 "packuswb "#z", "#p" \n\t"\ | |
57 "movd "#p", (%0) \n\t" | |
58 | |
/**
 * 4x4 inverse transform of block, added to the 4x4 pixel area at dst.
 *
 * @param dst    top-left destination pixel; rows are stride bytes apart
 * @param block  16 coefficients, read as four quadwords
 * @param stride distance in bytes between destination rows
 *
 * The work is spread over three asm statements that pass values to each
 * other in MMX registers:
 *  1. load the coefficients into mm0-mm3;
 *  2. IDCT4_1D, TRANSPOSE4, add ff_pw_32 (loaded into mm6) to mm3
 *     -- presumably the rounding term for the later >>6 -- then IDCT4_1D
 *     again and clear mm7;
 *  3. four STORE_DIFF_4P row updates, advancing dst by stride in between.
 *
 * NOTE(review): none of the statements declares the MMX registers as
 * operands or clobbers, so the hand-over relies on the compiler leaving
 * them alone between the statements.
 */
59 static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) | |
60 { | |
61 /* Load dct coeffs */ | |
62 __asm__ volatile( | |
63 "movq (%0), %%mm0 \n\t" | |
64 "movq 8(%0), %%mm1 \n\t" | |
65 "movq 16(%0), %%mm2 \n\t" | |
66 "movq 24(%0), %%mm3 \n\t" | |
67 :: "r"(block) ); | |
68 | |
69 __asm__ volatile( | |
70 /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ | |
71 IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) | |
72 | |
73 "movq %0, %%mm6 \n\t" | |
74 /* in: 1,4,0,2 out: 1,2,3,0 */ | |
75 TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ) | |
76 | |
77 "paddw %%mm6, %%mm3 \n\t" | |
78 | |
79 /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */ | |
80 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) | |
81 | |
82 "pxor %%mm7, %%mm7 \n\t" | |
83 :: "m"(ff_pw_32)); | |
84 | |
85 __asm__ volatile( | |
86 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) | |
87 "add %1, %0 \n\t" | |
88 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) | |
89 "add %1, %0 \n\t" | |
90 STORE_DIFF_4P( %%mm3, %%mm1, %%mm7) | |
91 "add %1, %0 \n\t" | |
92 STORE_DIFF_4P( %%mm4, %%mm1, %%mm7) | |
93 : "+r"(dst) | |
94 : "r" ((x86_reg)stride) | |
95 ); | |
96 } | |
97 | |
/**
 * One 1-D pass of the 8-point inverse transform over a four-column slice.
 *
 * Reads eight quadwords from block at byte offsets 0, 16, ..., 112, i.e.
 * four int16_t values from each of eight rows spaced 8 int16_t apart.
 * The offsets 16/48/80/112 are combined first, then 32/96, then 0/64, and
 * a final chain of SUMSUB_BA butterflies merges the partial results.
 *
 * Nothing is stored to memory: the eight result quadwords are left in
 * mm0-mm7 and the callers in this file consume them in the asm statement
 * that follows the call.
 * NOTE(review): no MMX register is declared as output or clobber, so this
 * depends on the compiler not using MMX registers around the call.
 */
98 static inline void h264_idct8_1d(int16_t *block) | |
99 { | |
100 __asm__ volatile( | |
101 "movq 112(%0), %%mm7 \n\t" | |
102 "movq 80(%0), %%mm0 \n\t" | |
103 "movq 48(%0), %%mm3 \n\t" | |
104 "movq 16(%0), %%mm5 \n\t" | |
105 | |
106 "movq %%mm0, %%mm4 \n\t" | |
107 "movq %%mm5, %%mm1 \n\t" | |
108 "psraw $1, %%mm4 \n\t" | |
109 "psraw $1, %%mm1 \n\t" | |
110 "paddw %%mm0, %%mm4 \n\t" | |
111 "paddw %%mm5, %%mm1 \n\t" | |
112 "paddw %%mm7, %%mm4 \n\t" | |
113 "paddw %%mm0, %%mm1 \n\t" | |
114 "psubw %%mm5, %%mm4 \n\t" | |
115 "paddw %%mm3, %%mm1 \n\t" | |
116 | |
117 "psubw %%mm3, %%mm5 \n\t" | |
118 "psubw %%mm3, %%mm0 \n\t" | |
119 "paddw %%mm7, %%mm5 \n\t" | |
120 "psubw %%mm7, %%mm0 \n\t" | |
121 "psraw $1, %%mm3 \n\t" | |
122 "psraw $1, %%mm7 \n\t" | |
123 "psubw %%mm3, %%mm5 \n\t" | |
124 "psubw %%mm7, %%mm0 \n\t" | |
125 | |
126 "movq %%mm4, %%mm3 \n\t" | |
127 "movq %%mm1, %%mm7 \n\t" | |
128 "psraw $2, %%mm1 \n\t" | |
129 "psraw $2, %%mm3 \n\t" | |
130 "paddw %%mm5, %%mm3 \n\t" | |
131 "psraw $2, %%mm5 \n\t" | |
132 "paddw %%mm0, %%mm1 \n\t" | |
133 "psraw $2, %%mm0 \n\t" | |
134 "psubw %%mm4, %%mm5 \n\t" | |
135 "psubw %%mm0, %%mm7 \n\t" | |
136 | |
137 "movq 32(%0), %%mm2 \n\t" | |
138 "movq 96(%0), %%mm6 \n\t" | |
139 "movq %%mm2, %%mm4 \n\t" | |
140 "movq %%mm6, %%mm0 \n\t" | |
141 "psraw $1, %%mm4 \n\t" | |
142 "psraw $1, %%mm6 \n\t" | |
143 "psubw %%mm0, %%mm4 \n\t" | |
144 "paddw %%mm2, %%mm6 \n\t" | |
145 | |
146 "movq (%0), %%mm2 \n\t" | |
147 "movq 64(%0), %%mm0 \n\t" | |
148 SUMSUB_BA( %%mm0, %%mm2 ) | |
149 SUMSUB_BA( %%mm6, %%mm0 ) | |
150 SUMSUB_BA( %%mm4, %%mm2 ) | |
151 SUMSUB_BA( %%mm7, %%mm6 ) | |
152 SUMSUB_BA( %%mm5, %%mm4 ) | |
153 SUMSUB_BA( %%mm3, %%mm2 ) | |
154 SUMSUB_BA( %%mm1, %%mm0 ) | |
155 :: "r"(block) | |
156 ); | |
157 } | |
158 | |
/**
 * 8x8 inverse transform of block, added to the 8x8 pixel area at dst.
 *
 * @param dst    top-left destination pixel
 * @param block  64 coefficients; block[0] is modified (32 is added to it
 *               before the transform)
 * @param stride passed on unchanged to ff_add_pixels_clamped_mmx()
 *
 * First loop: for each four-column half of block, run h264_idct8_1d() and
 * store the eight result registers, transposed as two 4x4 groups, into the
 * temporary b2 at b2 + 32*i. tmp is a spill slot for mm7, which is needed
 * as the scratch register of the first TRANSPOSE4.
 * Second loop: run h264_idct8_1d() on each four-column half of b2, shift
 * every result right by 6 and write it back to b2.
 * Finally b2 is handed to ff_add_pixels_clamped_mmx() together with dst.
 */
159 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) | |
160 { | |
161 int i; | |
11369 | 162 DECLARE_ALIGNED(8, int16_t, b2)[64]; |
8430 | 163 |
164 block[0] += 32; | |
165 | |
166 for(i=0; i<2; i++){ | |
11369 | 167 DECLARE_ALIGNED(8, uint64_t, tmp); |
8430 | 168 |
169 h264_idct8_1d(block+4*i); | |
170 | |
171 __asm__ volatile( | |
172 "movq %%mm7, %0 \n\t" | |
173 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) | |
174 "movq %%mm0, 8(%1) \n\t" | |
175 "movq %%mm6, 24(%1) \n\t" | |
176 "movq %%mm7, 40(%1) \n\t" | |
177 "movq %%mm4, 56(%1) \n\t" | |
178 "movq %0, %%mm7 \n\t" | |
179 TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) | |
180 "movq %%mm7, (%1) \n\t" | |
181 "movq %%mm1, 16(%1) \n\t" | |
182 "movq %%mm0, 32(%1) \n\t" | |
183 "movq %%mm3, 48(%1) \n\t" | |
184 : "=m"(tmp) | |
185 : "r"(b2+32*i) | |
186 : "memory" | |
187 ); | |
188 } | |
189 | |
190 for(i=0; i<2; i++){ | |
191 h264_idct8_1d(b2+4*i); | |
192 | |
193 __asm__ volatile( | |
194 "psraw $6, %%mm7 \n\t" | |
195 "psraw $6, %%mm6 \n\t" | |
196 "psraw $6, %%mm5 \n\t" | |
197 "psraw $6, %%mm4 \n\t" | |
198 "psraw $6, %%mm3 \n\t" | |
199 "psraw $6, %%mm2 \n\t" | |
200 "psraw $6, %%mm1 \n\t" | |
201 "psraw $6, %%mm0 \n\t" | |
202 | |
203 "movq %%mm7, (%0) \n\t" | |
204 "movq %%mm5, 16(%0) \n\t" | |
205 "movq %%mm3, 32(%0) \n\t" | |
206 "movq %%mm1, 48(%0) \n\t" | |
207 "movq %%mm0, 64(%0) \n\t" | |
208 "movq %%mm2, 80(%0) \n\t" | |
209 "movq %%mm4, 96(%0) \n\t" | |
210 "movq %%mm6, 112(%0) \n\t" | |
211 :: "r"(b2+4*i) | |
212 : "memory" | |
213 ); | |
214 } | |
215 | |
12435
fe78a4548d12
Put ff_ prefix on non-static {put_signed,put,add}_pixels_clamped_mmx()
rbultje
parents:
12414
diff
changeset
|
216 ff_add_pixels_clamped_mmx(b2, dst, stride); |
8430 | 217 } |
218 | |
/* Add one row of eight residuals to eight destination pixels.
 * p: register with eight 16-bit residuals; it is shifted right by 6 first.
 * d: memory operand of the destination row (8 bytes are read and written).
 * t: scratch register.
 * z: register that must contain zero (high half when widening to words).
 * The addition saturates as signed words; the pack saturates to unsigned
 * bytes. */
219 #define STORE_DIFF_8P( p, d, t, z )\ | |
220 "movq "#d", "#t" \n"\ | |
221 "psraw $6, "#p" \n"\ | |
222 "punpcklbw "#z", "#t" \n"\ | |
223 "paddsw "#t", "#p" \n"\ | |
224 "packuswb "#p", "#p" \n"\ | |
225 "movq "#p", "#d" \n" | |
226 | |
/* One 1-D pass of the 8-point inverse transform on eight registers.
 * On entry b, c, d, f, g and h must hold input data; a and e are written
 * before they are read, so their incoming contents do not matter.
 * The two remaining inputs are loaded inside the macro from 0x00(%1) and
 * 0x40(%1), where %1 is operand 1 of the enclosing asm statement, so the
 * caller has to make sure those two memory locations are up to date.
 * All eight registers are overwritten. */
227 #define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\ | |
228 "movdqa "#c", "#a" \n"\ | |
229 "movdqa "#g", "#e" \n"\ | |
230 "psraw $1, "#c" \n"\ | |
231 "psraw $1, "#g" \n"\ | |
232 "psubw "#e", "#c" \n"\ | |
233 "paddw "#a", "#g" \n"\ | |
234 "movdqa "#b", "#e" \n"\ | |
235 "psraw $1, "#e" \n"\ | |
236 "paddw "#b", "#e" \n"\ | |
237 "paddw "#d", "#e" \n"\ | |
238 "paddw "#f", "#e" \n"\ | |
239 "movdqa "#f", "#a" \n"\ | |
240 "psraw $1, "#a" \n"\ | |
241 "paddw "#f", "#a" \n"\ | |
242 "paddw "#h", "#a" \n"\ | |
243 "psubw "#b", "#a" \n"\ | |
244 "psubw "#d", "#b" \n"\ | |
245 "psubw "#d", "#f" \n"\ | |
246 "paddw "#h", "#b" \n"\ | |
247 "psubw "#h", "#f" \n"\ | |
248 "psraw $1, "#d" \n"\ | |
249 "psraw $1, "#h" \n"\ | |
250 "psubw "#d", "#b" \n"\ | |
251 "psubw "#h", "#f" \n"\ | |
252 "movdqa "#e", "#d" \n"\ | |
253 "movdqa "#a", "#h" \n"\ | |
254 "psraw $2, "#d" \n"\ | |
255 "psraw $2, "#h" \n"\ | |
256 "paddw "#f", "#d" \n"\ | |
257 "paddw "#b", "#h" \n"\ | |
258 "psraw $2, "#f" \n"\ | |
259 "psraw $2, "#b" \n"\ | |
260 "psubw "#f", "#e" \n"\ | |
261 "psubw "#a", "#b" \n"\ | |
262 "movdqa 0x00(%1), "#a" \n"\ | |
263 "movdqa 0x40(%1), "#f" \n"\ | |
264 SUMSUB_BA(f, a)\ | |
265 SUMSUB_BA(g, f)\ | |
266 SUMSUB_BA(c, a)\ | |
267 SUMSUB_BA(e, g)\ | |
268 SUMSUB_BA(b, c)\ | |
269 SUMSUB_BA(h, a)\ | |
270 SUMSUB_BA(d, f) | |
271 | |
/**
 * 8x8 inverse transform of block, added to the 8x8 pixel area at dst (SSE2).
 *
 * @param dst    top-left destination pixel
 * @param block  64 coefficients; accessed with movdqa, which requires the
 *               address to be 16-byte aligned. The buffer is also used as
 *               scratch space and is overwritten (see below).
 * @param stride distance in bytes between destination rows; 3*stride is
 *               passed as an extra operand so that rows 3 and 7 can be
 *               addressed directly
 *
 * Sequence: load the six rows that H264_IDCT8_1D_SSE2 expects in registers,
 * first 1-D pass, TRANSPOSE8 (which is given (%1), i.e. block, as its last
 * argument -- presumably as temporary storage), add ff_pw_32 to xmm4, write
 * xmm4 and xmm2 back to offsets 0x00 and 0x40 of block so the second 1-D
 * pass can load them, second 1-D pass, spill xmm6/xmm7 to offsets 0x60/0x70
 * to free registers for STORE_DIFF_8P, then add the eight rows to dst.
 *
 * NOTE(review): block is written through operand %1 although it is only
 * declared as an "r" input and there is no "memory" clobber; likewise no
 * xmm register is listed as clobbered.
 */
272 static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) | |
273 { | |
274 __asm__ volatile( | |
275 "movdqa 0x10(%1), %%xmm1 \n" | |
276 "movdqa 0x20(%1), %%xmm2 \n" | |
277 "movdqa 0x30(%1), %%xmm3 \n" | |
278 "movdqa 0x50(%1), %%xmm5 \n" | |
279 "movdqa 0x60(%1), %%xmm6 \n" | |
280 "movdqa 0x70(%1), %%xmm7 \n" | |
281 H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7) | |
282 TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1)) | |
283 "paddw %4, %%xmm4 \n" | |
284 "movdqa %%xmm4, 0x00(%1) \n" | |
285 "movdqa %%xmm2, 0x40(%1) \n" | |
286 H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1) | |
287 "movdqa %%xmm6, 0x60(%1) \n" | |
288 "movdqa %%xmm7, 0x70(%1) \n" | |
289 "pxor %%xmm7, %%xmm7 \n" | |
290 STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7) | |
291 STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7) | |
292 STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7) | |
293 STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7) | |
294 "lea (%0,%2,4), %0 \n" | |
295 STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7) | |
296 STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7) | |
297 "movdqa 0x60(%1), %%xmm0 \n" | |
298 "movdqa 0x70(%1), %%xmm1 \n" | |
299 STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7) | |
300 STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7) | |
301 :"+r"(dst) | |
302 :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32) | |
303 ); | |
304 } | |
305 | |
/**
 * DC-only 4x4 case: add dc = (block[0] + 32) >> 6 to every pixel of the
 * 4x4 area at dst, clamping to the unsigned byte range.
 *
 * @param dst    top-left destination pixel
 * @param block  only block[0] is read
 * @param stride distance in bytes between destination rows
 *
 * The first asm statement broadcasts dc to four words in mm0 and builds
 * -dc in mm1, then packs both to bytes with unsigned saturation, so the
 * register carrying the negative value becomes all zero. The second
 * statement updates each row with a saturating byte add of mm0 followed by
 * a saturating byte subtract of mm1, which together apply a signed offset.
 * mm0/mm1 are carried from the first statement to the second in registers.
 */
306 static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) | |
307 { | |
308 int dc = (block[0] + 32) >> 6; | |
309 __asm__ volatile( | |
310 "movd %0, %%mm0 \n\t" | |
311 "pshufw $0, %%mm0, %%mm0 \n\t" | |
312 "pxor %%mm1, %%mm1 \n\t" | |
313 "psubw %%mm0, %%mm1 \n\t" | |
314 "packuswb %%mm0, %%mm0 \n\t" | |
315 "packuswb %%mm1, %%mm1 \n\t" | |
316 ::"r"(dc) | |
317 ); | |
318 __asm__ volatile( | |
319 "movd %0, %%mm2 \n\t" | |
320 "movd %1, %%mm3 \n\t" | |
321 "movd %2, %%mm4 \n\t" | |
322 "movd %3, %%mm5 \n\t" | |
323 "paddusb %%mm0, %%mm2 \n\t" | |
324 "paddusb %%mm0, %%mm3 \n\t" | |
325 "paddusb %%mm0, %%mm4 \n\t" | |
326 "paddusb %%mm0, %%mm5 \n\t" | |
327 "psubusb %%mm1, %%mm2 \n\t" | |
328 "psubusb %%mm1, %%mm3 \n\t" | |
329 "psubusb %%mm1, %%mm4 \n\t" | |
330 "psubusb %%mm1, %%mm5 \n\t" | |
331 "movd %%mm2, %0 \n\t" | |
332 "movd %%mm3, %1 \n\t" | |
333 "movd %%mm4, %2 \n\t" | |
334 "movd %%mm5, %3 \n\t" | |
335 :"+m"(*(uint32_t*)(dst+0*stride)), | |
336 "+m"(*(uint32_t*)(dst+1*stride)), | |
337 "+m"(*(uint32_t*)(dst+2*stride)), | |
338 "+m"(*(uint32_t*)(dst+3*stride)) | |
339 ); | |
340 } | |
341 | |
/**
 * DC-only 8x8 case: add dc = (block[0] + 32) >> 6 to every pixel of the
 * 8x8 area at dst, clamping to the unsigned byte range.
 *
 * @param dst    top-left destination pixel
 * @param block  only block[0] is read
 * @param stride distance in bytes between destination rows
 *
 * mm0 (dc) and mm1 (-dc) are prepared as bytes with unsigned saturation in
 * the first asm statement and stay in registers for the loop. The loop runs
 * twice; each iteration updates four rows of eight bytes (saturating add of
 * mm0, then saturating subtract of mm1) and advances dst by 4*stride.
 */
342 static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) | |
343 { | |
344 int dc = (block[0] + 32) >> 6; | |
345 int y; | |
346 __asm__ volatile( | |
347 "movd %0, %%mm0 \n\t" | |
348 "pshufw $0, %%mm0, %%mm0 \n\t" | |
349 "pxor %%mm1, %%mm1 \n\t" | |
350 "psubw %%mm0, %%mm1 \n\t" | |
351 "packuswb %%mm0, %%mm0 \n\t" | |
352 "packuswb %%mm1, %%mm1 \n\t" | |
353 ::"r"(dc) | |
354 ); | |
355 for(y=2; y--; dst += 4*stride){ | |
356 __asm__ volatile( | |
357 "movq %0, %%mm2 \n\t" | |
358 "movq %1, %%mm3 \n\t" | |
359 "movq %2, %%mm4 \n\t" | |
360 "movq %3, %%mm5 \n\t" | |
361 "paddusb %%mm0, %%mm2 \n\t" | |
362 "paddusb %%mm0, %%mm3 \n\t" | |
363 "paddusb %%mm0, %%mm4 \n\t" | |
364 "paddusb %%mm0, %%mm5 \n\t" | |
365 "psubusb %%mm1, %%mm2 \n\t" | |
366 "psubusb %%mm1, %%mm3 \n\t" | |
367 "psubusb %%mm1, %%mm4 \n\t" | |
368 "psubusb %%mm1, %%mm5 \n\t" | |
369 "movq %%mm2, %0 \n\t" | |
370 "movq %%mm3, %1 \n\t" | |
371 "movq %%mm4, %2 \n\t" | |
372 "movq %%mm5, %3 \n\t" | |
373 :"+m"(*(uint64_t*)(dst+0*stride)), | |
374 "+m"(*(uint64_t*)(dst+1*stride)), | |
375 "+m"(*(uint64_t*)(dst+2*stride)), | |
376 "+m"(*(uint64_t*)(dst+3*stride)) | |
377 ); | |
378 } | |
379 } | |
380 | |
/* Maps a block number to its position in the nnzc[6*8] arrays taken by the
 * functions in this file. Every entry is written as x + y*8, i.e. column x
 * of row y in an array with 8 entries per row. Entries 0-15 are used by
 * the loops that run over i < 16, entries 16-23 by the *_add8_* loops
 * (presumably luma and chroma blocks respectively -- confirm against
 * h264data.h). */
381 //FIXME this table duplicates the one in h264data.h and will be removed once the tables from h264 have been split | |
382 static const uint8_t scan8[16 + 2*4]={ | |
383 4+1*8, 5+1*8, 4+2*8, 5+2*8, | |
384 6+1*8, 7+1*8, 6+2*8, 7+2*8, | |
385 4+3*8, 5+3*8, 4+4*8, 5+4*8, | |
386 6+3*8, 7+3*8, 6+4*8, 7+4*8, | |
387 1+1*8, 2+1*8, | |
388 1+2*8, 2+2*8, | |
389 1+4*8, 2+4*8, | |
390 1+5*8, 2+5*8, | |
391 }; | |
392 | |
/**
 * Run the 4x4 transform-and-add on up to sixteen blocks.
 *
 * For i = 0..15, when nnzc[scan8[i]] is non-zero, ff_h264_idct_add_mmx() is
 * called with destination dst + block_offset[i] and coefficients
 * block + i*16. Blocks whose nnzc entry is zero are skipped.
 */
393 static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ | |
394 int i; | |
395 for(i=0; i<16; i++){ | |
396 if(nnzc[ scan8[i] ]) | |
397 ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride); | |
398 } | |
399 } | |
400 | |
/**
 * Run the 8x8 transform-and-add on up to four blocks.
 *
 * For i = 0, 4, 8, 12, when nnzc[scan8[i]] is non-zero,
 * ff_h264_idct8_add_mmx() is called with destination dst + block_offset[i]
 * and coefficients block + i*16.
 */
401 static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ | |
402 int i; | |
403 for(i=0; i<16; i+=4){ | |
404 if(nnzc[ scan8[i] ]) | |
405 ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride); | |
406 } | |
407 } | |
408 | |
409 | |
/**
 * Like ff_h264_idct_add16_mmx(), with a shortcut for DC-only blocks.
 *
 * For i = 0..15 with a non-zero nnzc[scan8[i]]: if that count is exactly 1
 * and the first coefficient block[i*16] is non-zero, the single coefficient
 * is the DC term and ff_h264_idct_dc_add_mmx2() is used; otherwise the full
 * ff_h264_idct_add_mmx() is run.
 */
410 static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ | |
411 int i; | |
412 for(i=0; i<16; i++){ | |
413 int nnz = nnzc[ scan8[i] ]; | |
414 if(nnz){ | |
415 if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); | |
416 else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); | |
417 } | |
418 } | |
419 } | |
420 | |
/**
 * Variant of ff_h264_idct_add16_mmx() that also processes a block when its
 * nnzc entry is zero but its first coefficient block[i*16] is non-zero.
 * The full ff_h264_idct_add_mmx() is used in both cases.
 */
421 static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ | |
422 int i; | |
423 for(i=0; i<16; i++){ | |
424 if(nnzc[ scan8[i] ] || block[i*16]) | |
425 ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride); | |
426 } | |
427 } | |
428 | |
/**
 * Variant of ff_h264_idct_add16intra_mmx() with a DC shortcut.
 *
 * For i = 0..15: a non-zero nnzc[scan8[i]] selects the full
 * ff_h264_idct_add_mmx(); otherwise, if the first coefficient block[i*16]
 * is non-zero, ff_h264_idct_dc_add_mmx2() is used. Blocks with neither are
 * skipped.
 */
429 static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ | |
430 int i; | |
431 for(i=0; i<16; i++){ | |
432 if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); | |
433 else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); | |
434 } | |
435 } | |
436 | |
/**
 * Like ff_h264_idct8_add4_mmx(), with a shortcut for DC-only blocks.
 *
 * For i = 0, 4, 8, 12 with a non-zero nnzc[scan8[i]]: if that count is
 * exactly 1 and block[i*16] is non-zero, ff_h264_idct8_dc_add_mmx2() is
 * used; otherwise the full ff_h264_idct8_add_mmx() is run.
 */
437 static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ | |
438 int i; | |
439 for(i=0; i<16; i+=4){ | |
440 int nnz = nnzc[ scan8[i] ]; | |
441 if(nnz){ | |
442 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); | |
443 else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride); | |
444 } | |
445 } | |
446 } | |
447 | |
/**
 * Same dispatch as ff_h264_idct8_add4_mmx2(), but the full transform is
 * done by ff_h264_idct8_add_sse2(). The DC-only shortcut still uses the
 * MMX2 routine ff_h264_idct8_dc_add_mmx2().
 */
448 static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ | |
449 int i; | |
450 for(i=0; i<16; i+=4){ | |
451 int nnz = nnzc[ scan8[i] ]; | |
452 if(nnz){ | |
453 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); | |
454 else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride); | |
455 } | |
456 } | |
457 } | |
458 | |
/**
 * Run the 4x4 transform-and-add on blocks 16..23, split over two
 * destination planes.
 *
 * dest[(i&4)>>2] selects dest[0] for i = 16..19 and dest[1] for i = 20..23.
 * A block is processed when nnzc[scan8[i]] or its first coefficient
 * block[i*16] is non-zero; the full ff_h264_idct_add_mmx() is always used.
 */
459 static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ | |
460 int i; | |
461 for(i=16; i<16+8; i++){ | |
462 if(nnzc[ scan8[i] ] || block[i*16]) | |
463 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); | |
464 } | |
465 } | |
466 | |
/**
 * Variant of ff_h264_idct_add8_mmx() with a DC shortcut.
 *
 * For i = 16..23 (dest[0] for 16..19, dest[1] for 20..23): a non-zero
 * nnzc[scan8[i]] selects the full ff_h264_idct_add_mmx(); otherwise, if
 * block[i*16] is non-zero, ff_h264_idct_dc_add_mmx2() is used.
 */
467 static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ | |
468 int i; | |
469 for(i=16; i<16+8; i++){ | |
470 if(nnzc[ scan8[i] ]) | |
471 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); | |
472 else if(block[i*16]) | |
473 ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); | |
474 } | |
475 } | |
476 | |
8590 | 477 #if CONFIG_GPL && HAVE_YASM |
8510
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
478 static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride) |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
479 { |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
480 __asm__ volatile( |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
481 "movd %0, %%mm0 \n\t" // 0 0 X D |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
482 "punpcklwd %1, %%mm0 \n\t" // x X d D |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
483 "paddsw %2, %%mm0 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
484 "psraw $6, %%mm0 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
485 "punpcklwd %%mm0, %%mm0 \n\t" // d d D D |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
486 "pxor %%mm1, %%mm1 \n\t" // 0 0 0 0 |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
487 "psubw %%mm0, %%mm1 \n\t" // -d-d-D-D |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
488 "packuswb %%mm1, %%mm0 \n\t" // -d-d-D-D d d D D |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
489 "pshufw $0xFA, %%mm0, %%mm1 \n\t" // -d-d-d-d-D-D-D-D |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
490 "punpcklwd %%mm0, %%mm0 \n\t" // d d d d D D D D |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
491 ::"m"(block[ 0]), |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
492 "m"(block[16]), |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
493 "m"(ff_pw_32) |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
494 ); |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
495 __asm__ volatile( |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
496 "movq %0, %%mm2 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
497 "movq %1, %%mm3 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
498 "movq %2, %%mm4 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
499 "movq %3, %%mm5 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
500 "paddusb %%mm0, %%mm2 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
501 "paddusb %%mm0, %%mm3 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
502 "paddusb %%mm0, %%mm4 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
503 "paddusb %%mm0, %%mm5 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
504 "psubusb %%mm1, %%mm2 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
505 "psubusb %%mm1, %%mm3 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
506 "psubusb %%mm1, %%mm4 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
507 "psubusb %%mm1, %%mm5 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
508 "movq %%mm2, %0 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
509 "movq %%mm3, %1 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
510 "movq %%mm4, %2 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
511 "movq %%mm5, %3 \n\t" |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
512 :"+m"(*(uint64_t*)(dst+0*stride)), |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
513 "+m"(*(uint64_t*)(dst+1*stride)), |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
514 "+m"(*(uint64_t*)(dst+2*stride)), |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
515 "+m"(*(uint64_t*)(dst+3*stride)) |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
516 ); |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
517 } |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
518 |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
519 extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride); |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
520 |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
521 static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
522 int i; |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
523 for(i=0; i<16; i+=2) |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
524 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
525 ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride); |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
526 } |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
527 |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
528 static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
529 int i; |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
530 for(i=0; i<16; i+=2){ |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
531 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
532 ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride); |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
533 else if(block[i*16]|block[i*16+16]) |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
534 ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride); |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
535 } |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
536 } |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
537 |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
538 static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
539 int i; |
8758
93980b03673e
fix typo in h264dsp_mmx (no effect currently as the function is not used), approved by Dark Shikari on IRC
bcoudurier
parents:
8590
diff
changeset
|
540 for(i=16; i<16+8; i+=2){ |
8510
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
541 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
542 ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
543 else if(block[i*16]|block[i*16+16]) |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
544 ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
545 } |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
546 } |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
547 #endif |
cea216e44ee3
Add x264 SSE2 iDCT functions to H.264 decoder.
darkshikari
parents:
8430
diff
changeset
|
548 |
8430 | 549 /***********************************/ |
550 /* deblocking */ | |
551 | |
/**
 * Compute deblocking boundary strengths with MMX2 and store them in bS.
 *
 * @param bS       output; one quadword of four 16-bit values is written to
 *                 bS[dir][edge] for every edge visited
 * @param nnz      read at b and bn (four bytes each)
 * @param ref      ref[0] is always read; the bidir path also reads 40 bytes
 *                 further on, i.e. ref[1]
 * @param mv       mv[0] is always read; the bidir path also reads 160 bytes
 *                 further on, i.e. mv[1]
 * @param bidir    non-zero selects the two-list comparison
 * @param edges    loop bound of the edge loop for dir 1 (dir 0 always uses 4)
 * @param step     edge increment for dir 1 (dir 0 always uses 1)
 * @param mask_mv0 for dir 0: the ref/mv test is skipped on edges where
 *                 (mask_mv0 & edge) is non-zero
 * @param mask_mv1 same, for dir 1
 * @param field    non-zero loads ff_pb_3_1 instead of ff_pb_3 into mm6
 *
 * Here b = b_idx and bn = b_idx + d_idx, with d_idx = -8 for dir 1 and -1
 * for dir 0; b_idx starts at 12 and grows by 8*step per edge.
 *
 * Register roles, set up once and kept across all asm statements:
 * mm7 = ff_pb_1, mm6 = ff_pb_3 (or ff_pb_3_1), mm5 = 2*mm6. mm6/mm5 are the
 * bias and threshold of the motion vector difference test; mm0 accumulates
 * the ref/mv result for the current edge.
 *
 * Per edge the stored value in each lane is the maximum of
 *   2 if nnz[b] | nnz[bn] is non-zero, and
 *   1 if mm0 is non-zero in that lane,
 * so it is 0, 1 or 2.
 *
 * At the end the four quadwords of bS[0] are transposed in place.
 *
 * NOTE(review): each edge starts with "pand mask_dir, mm0". For dir 1 the
 * mask is 0 and clears mm0. For dir 0 the mask is all ones, so when the
 * ref/mv test is skipped mm0 keeps whatever the previous asm statements
 * left in it.
 * NOTE(review): the bidir asm adds 40 to operand %0 (d_idx) twice and then
 * subtracts 80, although %0 is declared as an input only; the value is
 * restored before the statement ends.
 * NOTE(review): MMX registers are carried between statements without being
 * declared as operands or clobbers.
 */
552 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], | |
553 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { | |
554 int dir; | |
555 __asm__ volatile( | |
11017 | 556 "movq %0, %%mm7 \n" |
557 "movq %1, %%mm6 \n" | |
558 ::"m"(ff_pb_1), "m"(ff_pb_3) | |
8430 | 559 ); |
560 if(field) | |
561 __asm__ volatile( | |
11017 | 562 "movq %0, %%mm6 \n" |
563 ::"m"(ff_pb_3_1) | |
8430 | 564 ); |
11017 | 565 __asm__ volatile( |
566 "movq %%mm6, %%mm5 \n" | |
567 "paddb %%mm5, %%mm5 \n" | |
568 :); | |
8430 | 569 |
570 // could do a special case for dir==0 && edges==1, but it only reduces the | |
571 // average filter time by 1.2% | |
572 for( dir=1; dir>=0; dir-- ) { | |
10953 | 573 const x86_reg d_idx = dir ? -8 : -1; |
8430 | 574 const int mask_mv = dir ? mask_mv1 : mask_mv0; |
11369 | 575 DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; |
11017 | 576 int b_idx, edge; |
8430 | 577 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { |
578 __asm__ volatile( | |
579 "pand %0, %%mm0 \n\t" | |
580 ::"m"(mask_dir) | |
581 ); | |
582 if(!(mask_mv & edge)) { | |
11017 | 583 if(bidir) { |
8430 | 584 __asm__ volatile( |
11017 | 585 "movd (%1,%0), %%mm2 \n" |
586 "punpckldq 40(%1,%0), %%mm2 \n" // { ref0[bn], ref1[bn] } | |
587 "pshufw $0x44, (%1), %%mm0 \n" // { ref0[b], ref0[b] } | |
588 "pshufw $0x44, 40(%1), %%mm1 \n" // { ref1[b], ref1[b] } | |
589 "pshufw $0x4E, %%mm2, %%mm3 \n" | |
590 "psubb %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] } | |
591 "psubb %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] } | |
592 "1: \n" | |
593 "por %%mm1, %%mm0 \n" | |
594 "movq (%2,%0,4), %%mm1 \n" | |
595 "movq 8(%2,%0,4), %%mm2 \n" | |
596 "movq %%mm1, %%mm3 \n" | |
597 "movq %%mm2, %%mm4 \n" | |
598 "psubw (%2), %%mm1 \n" | |
599 "psubw 8(%2), %%mm2 \n" | |
600 "psubw 160(%2), %%mm3 \n" | |
601 "psubw 168(%2), %%mm4 \n" | |
602 "packsswb %%mm2, %%mm1 \n" | |
603 "packsswb %%mm4, %%mm3 \n" | |
604 "paddb %%mm6, %%mm1 \n" | |
605 "paddb %%mm6, %%mm3 \n" | |
606 "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit | |
607 "psubusb %%mm5, %%mm3 \n" | |
608 "packsswb %%mm3, %%mm1 \n" | |
609 "add $40, %0 \n" | |
610 "cmp $40, %0 \n" | |
611 "jl 1b \n" | |
612 "sub $80, %0 \n" | |
613 "pshufw $0x4E, %%mm1, %%mm1 \n" | |
614 "por %%mm1, %%mm0 \n" | |
615 "pshufw $0x4E, %%mm0, %%mm1 \n" | |
616 "pminub %%mm1, %%mm0 \n" | |
617 ::"r"(d_idx), | |
618 "r"(ref[0]+b_idx), | |
619 "r"(mv[0]+b_idx) | |
8430 | 620 ); |
11017 | 621 } else { |
10938
2a50b786b888
Fix h264_loop_filter_strength_mmx2() so it works with b frames.
michael
parents:
10937
diff
changeset
|
622 __asm__ volatile( |
11017 | 623 "movd (%1), %%mm0 \n" |
624 "psubb (%1,%0), %%mm0 \n" // ref[b] != ref[bn] | |
625 "movq (%2), %%mm1 \n" | |
626 "movq 8(%2), %%mm2 \n" | |
627 "psubw (%2,%0,4), %%mm1 \n" | |
628 "psubw 8(%2,%0,4), %%mm2 \n" | |
629 "packsswb %%mm2, %%mm1 \n" | |
630 "paddb %%mm6, %%mm1 \n" | |
631 "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit | |
632 "packsswb %%mm1, %%mm1 \n" | |
633 "por %%mm1, %%mm0 \n" | |
634 ::"r"(d_idx), | |
635 "r"(ref[0]+b_idx), | |
636 "r"(mv[0]+b_idx) | |
10938
2a50b786b888
Fix h264_loop_filter_strength_mmx2() so it works with b frames.
michael
parents:
10937
diff
changeset
|
637 ); |
2a50b786b888
Fix h264_loop_filter_strength_mmx2() so it works with b frames.
michael
parents:
10937
diff
changeset
|
638 } |
8430 | 639 } |
640 __asm__ volatile( | |
11017 | 641 "movd %0, %%mm1 \n" |
642 "por %1, %%mm1 \n" // nnz[b] || nnz[bn] | |
8430 | 643 ::"m"(nnz[b_idx]), |
644 "m"(nnz[b_idx+d_idx]) | |
645 ); | |
646 __asm__ volatile( | |
11017 | 647 "pminub %%mm7, %%mm1 \n" |
648 "pminub %%mm7, %%mm0 \n" | |
649 "psllw $1, %%mm1 \n" | |
650 "pxor %%mm2, %%mm2 \n" | |
651 "pmaxub %%mm0, %%mm1 \n" | |
652 "punpcklbw %%mm2, %%mm1 \n" | |
653 "movq %%mm1, %0 \n" | |
8430 | 654 :"=m"(*bS[dir][edge]) |
655 ::"memory" | |
656 ); | |
657 } | |
658 edges = 4; | |
659 step = 1; | |
660 } | |
661 __asm__ volatile( | |
662 "movq (%0), %%mm0 \n\t" | |
663 "movq 8(%0), %%mm1 \n\t" | |
664 "movq 16(%0), %%mm2 \n\t" | |
665 "movq 24(%0), %%mm3 \n\t" | |
666 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4) | |
667 "movq %%mm0, (%0) \n\t" | |
668 "movq %%mm3, 8(%0) \n\t" | |
669 "movq %%mm4, 16(%0) \n\t" | |
670 "movq %%mm2, 24(%0) \n\t" | |
671 ::"r"(bS[0]) | |
672 :"memory" | |
673 ); | |
674 } | |
675 | |
12454
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
12451
diff
changeset
|
/*
 * Prototypes for the x264-derived deblocking (loop filter) routines.
 *
 * LF_FUNC declares
 *     void ff_x264_deblock_<DIR>_<TYPE>_<OPT>(pix, stride, alpha, beta, tc0)
 * LF_IFUNC declares the same name pattern without the tc0 argument; it is
 * used below for the *_intra variants.
 *
 * Both macro bodies already end in ';', so the instantiations below are
 * written without a trailing semicolon.
 *
 * NOTE(review): the routines declared here are not defined in the visible
 * part of this file; presumably they live in the yasm sources -- confirm.
 */
#define LF_FUNC(DIR, TYPE, OPT) \
void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
                                                       int alpha, int beta, int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, OPT) \
void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
                                                       int alpha, int beta);

LF_FUNC (h,  chroma,       mmxext)
LF_IFUNC(h,  chroma_intra, mmxext)
LF_FUNC (v,  chroma,       mmxext)
LF_IFUNC(v,  chroma_intra, mmxext)

LF_FUNC (h,  luma,         mmxext)
LF_IFUNC(h,  luma_intra,   mmxext)
#if HAVE_YASM && ARCH_X86_32
/*
 * In this configuration the mmxext vertical luma filters are provided as
 * "v8" routines, and the v_luma entry points are built in C by calling the
 * v8 routine twice, at pix+0 and pix+8.
 */
LF_FUNC (v8, luma,         mmxext)
static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    /*
     * Each call is handed a pair of tc0 entries. The bitwise AND of two
     * int8_t values is negative only when both have their sign bit set, so
     * a half is skipped only when both of its tc0 entries are negative.
     * NOTE(review): presumably a negative tc0 marks a segment that must not
     * be filtered -- confirm against the asm.
     */
    if((tc0[0] & tc0[1]) >= 0)
        ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
    if((tc0[2] & tc0[3]) >= 0)
        ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
}
LF_IFUNC(v8, luma_intra,   mmxext)
static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
{
    /* No tc0 table here, so both halves are always filtered. */
    ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
    ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
}
#endif

LF_FUNC (h,  luma,         sse2)
LF_IFUNC(h,  luma_intra,   sse2)
LF_FUNC (v,  luma,         sse2)
LF_IFUNC(v,  luma_intra,   sse2)
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
12451
diff
changeset
|
711 |
8430 | 712 /***********************************/ |
713 /* weighted prediction */ | |
714 | |
12451
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
/*
 * Prototypes for the weighted / bi-weighted prediction routines.
 *
 * H264_WEIGHT(W, H, OPT) declares
 *     void ff_h264_weight_<W>x<H>_<OPT>(dst, stride, log2_denom, weight, offset)
 * H264_BIWEIGHT(W, H, OPT) declares
 *     void ff_h264_biweight_<W>x<H>_<OPT>(dst, src, stride, log2_denom,
 *                                         weightd, weights, offset)
 *
 * Both macro bodies already end in ';', so the instantiations below are
 * written without a trailing semicolon.
 */
#define H264_WEIGHT(W, H, OPT) \
void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
    int stride, int log2_denom, int weight, int offset);

#define H264_BIWEIGHT(W, H, OPT) \
void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
    uint8_t *src, int stride, int log2_denom, int weightd, \
    int weights, int offset);

/* Declares the mmx2 weight and biweight routines for one block size. */
#define H264_BIWEIGHT_MMX(W,H) \
H264_WEIGHT  (W, H, mmx2) \
H264_BIWEIGHT(W, H, mmx2)

/*
 * Declares the mmx2 pair plus the sse2 weight/biweight and the ssse3
 * biweight routines for one block size (no ssse3 weight routine is declared).
 */
#define H264_BIWEIGHT_MMX_SSE(W,H) \
H264_BIWEIGHT_MMX(W, H) \
H264_WEIGHT      (W, H, sse2) \
H264_BIWEIGHT    (W, H, sse2) \
H264_BIWEIGHT    (W, H, ssse3)

/* Widths 16 and 8 get mmx2 + sse2 + ssse3 declarations; width 4 only mmx2. */
H264_BIWEIGHT_MMX_SSE(16, 16)
H264_BIWEIGHT_MMX_SSE(16,  8)
H264_BIWEIGHT_MMX_SSE( 8, 16)
H264_BIWEIGHT_MMX_SSE( 8,  8)
H264_BIWEIGHT_MMX_SSE( 8,  4)
H264_BIWEIGHT_MMX    ( 4,  8)
H264_BIWEIGHT_MMX    ( 4,  4)
H264_BIWEIGHT_MMX    ( 4,  2)
12450
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
742 |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
743 void ff_h264dsp_init_x86(H264DSPContext *c) |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
744 { |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
745 int mm_flags = mm_support(); |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
746 |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
747 if (mm_flags & FF_MM_MMX) { |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
748 c->h264_idct_dc_add= |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
749 c->h264_idct_add= ff_h264_idct_add_mmx; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
750 c->h264_idct8_dc_add= |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
751 c->h264_idct8_add= ff_h264_idct8_add_mmx; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
752 |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
753 c->h264_idct_add16 = ff_h264_idct_add16_mmx; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
754 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
755 c->h264_idct_add8 = ff_h264_idct_add8_mmx; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
756 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
757 |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
758 if (mm_flags & FF_MM_MMX2) { |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
759 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
760 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
761 c->h264_idct_add16 = ff_h264_idct_add16_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
762 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
763 c->h264_idct_add8 = ff_h264_idct_add8_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
764 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
765 |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
766 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; |
12451
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
767 } |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
768 if(mm_flags & FF_MM_SSE2){ |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
769 c->h264_idct8_add = ff_h264_idct8_add_sse2; |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
770 c->h264_idct8_add4= ff_h264_idct8_add4_sse2; |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
771 } |
12450
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
772 |
12451
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
773 #if HAVE_YASM |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
774 if (mm_flags & FF_MM_MMX2){ |
12454
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
12451
diff
changeset
|
775 c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext; |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
12451
diff
changeset
|
776 c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext; |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
12451
diff
changeset
|
777 c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext; |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
12451
diff
changeset
|
778 c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext; |
12451
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
779 #if ARCH_X86_32 |
12454
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
12451
diff
changeset
|
780 c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext; |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
12451
diff
changeset
|
781 c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext; |
12451
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
782 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
783 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
784 #endif |
12450
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
785 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
786 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
787 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
788 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
789 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
790 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
791 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
792 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
793 |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
794 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
795 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
796 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
797 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
798 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
799 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
800 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
801 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
802 |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
803 if( mm_flags&FF_MM_SSE2 ){ |
12451
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
804 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
805 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
806 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2; |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
807 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2; |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
808 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2; |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
809 |
12450
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
810 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; |
12451
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
811 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2; |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
812 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2; |
12450
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
813 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2; |
12451
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
814 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
815 |
12450
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
816 #if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110 |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
817 c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
818 c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
819 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
820 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
821 #endif |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
822 #if CONFIG_GPL |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
823 c->h264_idct_add16 = ff_h264_idct_add16_sse2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
824 c->h264_idct_add8 = ff_h264_idct_add8_sse2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
825 c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2; |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
826 #endif |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
827 } |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
828 if ( mm_flags&FF_MM_SSSE3 ){ |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
829 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; |
12451
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
830 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3; |
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
831 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3; |
12450
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
832 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; |
12451
4c3e6ff1237e
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
rbultje
parents:
12450
diff
changeset
|
833 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3; |
12450
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
834 } |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
835 } |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
836 #endif |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
837 } |
3941687b4fa9
Split h264dsp_mmx.c (which was #included in dsputil_mmx.c) in h264_qpel_mmx.c,
rbultje
parents:
12438
diff
changeset
|
838 } |