Mercurial > libavcodec.hg
annotate x86/vc1dsp_mmx.c @ 12454:f4355cd85faa libavcodec
Port latest x264 deblock asm (before they moved to using NV12 as internal
format), LGPL'ed with permission from Jason and Loren. This includes mmx2
code, so remove inline asm from h264dsp_mmx.c accordingly.
author | rbultje |
---|---|
date | Fri, 03 Sep 2010 16:52:46 +0000 |
parents | 3fc4c625b6f3 |
children | a5ddb39627fd |
rev | line source |
---|---|
8430 | 1 /* |
2 * VC-1 and WMV3 - DSP functions MMX-optimized | |
3 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr> | |
4 * | |
5 * Permission is hereby granted, free of charge, to any person | |
6 * obtaining a copy of this software and associated documentation | |
7 * files (the "Software"), to deal in the Software without | |
8 * restriction, including without limitation the rights to use, | |
9 * copy, modify, merge, publish, distribute, sublicense, and/or sell | |
10 * copies of the Software, and to permit persons to whom the | |
11 * Software is furnished to do so, subject to the following | |
12 * conditions: | |
13 * | |
14 * The above copyright notice and this permission notice shall be | |
15 * included in all copies or substantial portions of the Software. | |
16 * | |
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | |
19 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
21 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
22 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
24 * OTHER DEALINGS IN THE SOFTWARE. | |
25 */ | |
26 | |
27 #include "libavutil/x86_cpu.h" | |
28 #include "libavcodec/dsputil.h" | |
29 #include "dsputil_mmx.h" | |
30 | |
/* Store operators used to build put_/avg_ function pairs:
 * OP_PUT emits nothing (the following movq is a plain store);
 * OP_AVG emits a rounded byte average (pavgb) of the computed result
 * with the bytes already present at the destination. */
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
/** Add the rounder held in mm7 to mm3/mm4 and shift both down by SHIFT.
 *  (Packing to bytes, when wanted, is done separately by TRANSFER_DO_PACK.) */
#define NORMALIZE_MMX(SHIFT)                                    \
     "paddw     %%mm7, %%mm3           \n\t" /* +bias-r */      \
     "paddw     %%mm7, %%mm4           \n\t" /* +bias-r */      \
     "psraw     "SHIFT", %%mm3         \n\t"                    \
     "psraw     "SHIFT", %%mm4         \n\t"
40 | |
/* Pack mm3/mm4 down to 8 unsigned bytes and store them at (%2);
 * OP (OP_PUT/OP_AVG) optionally averages with the existing pixels first. */
#define TRANSFER_DO_PACK(OP)                    \
     "packuswb  %%mm4, %%mm3           \n\t"    \
     OP((%2), %%mm3)                            \
     "movq      %%mm3, (%2)            \n\t"

/* Store mm3/mm4 as 8 still-unpacked 16-bit words at 0(%2)/8(%2);
 * OP optionally averages with the existing data first. */
#define TRANSFER_DONT_PACK(OP)                  \
     OP(0(%2), %%mm3)                           \
     OP(8(%2), %%mm4)                           \
     "movq      %%mm3, 0(%2)           \n\t"    \
     "movq      %%mm4, 8(%2)           \n\t"
51 | |
/** @see MSPEL_FILTER13_CORE for use as UNPACK macro: widen 4 bytes of reg
 *  to words using mm0 (assumed zero), or do nothing when data is already
 *  16-bit. */
#define DO_UNPACK(reg)  "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)

/** Load the rounder (32-r or 8-r) from memory operand ROUND and broadcast
 *  it to all four words of mm7. */
#define LOAD_ROUNDER_MMX(ROUND)                 \
     "movd      "ROUND", %%mm7         \n\t"    \
     "punpcklwd %%mm7, %%mm7           \n\t"    \
     "punpckldq %%mm7, %%mm7           \n\t"
61 | |
/* One output line of the vertical (-1,9,9,-1) 1/2-shift filter.
 * R0..R3 rotate roles between successive calls so that two of the four
 * taps are already loaded/widened from the previous line; the new taps
 * are fetched at (%0,%3) (= src-2*stride) and (%0,%2) (= src+stride),
 * mm6 holds the 9 factor, mm7 the rounder, %4 the shift.  The 16-bit
 * result is stored at OFF(%1) and src (%0) is advanced by one stride. */
#define SHIFT2_LINE(OFF, R0,R1,R2,R3)           \
    "paddw     %%mm"#R2", %%mm"#R1"    \n\t"    \
    "movd      (%0,%3), %%mm"#R0"      \n\t"    \
    "pmullw    %%mm6, %%mm"#R1"        \n\t"    \
    "punpcklbw %%mm0, %%mm"#R0"        \n\t"    \
    "movd      (%0,%2), %%mm"#R3"      \n\t"    \
    "psubw     %%mm"#R0", %%mm"#R1"    \n\t"    \
    "punpcklbw %%mm0, %%mm"#R3"        \n\t"    \
    "paddw     %%mm7, %%mm"#R1"        \n\t"    \
    "psubw     %%mm"#R3", %%mm"#R1"    \n\t"    \
    "psraw     %4, %%mm"#R1"           \n\t"    \
    "movq      %%mm"#R1", "#OFF"(%1)   \n\t"    \
    "add       %2, %0                  \n\t"
/** Vertical 1/2-shift filter producing 16-bit intermediates.
 *  Sacrificing mm6 (kept loaded with ff_pw_9) allows pipelining the loads
 *  from src.  Processes an 11x8 area into 24-byte-pitch rows of dst, three
 *  4-pixel columns per row (outer loop count 3 in REG_c).
 *  NOTE(review): mm0 is assumed zero — it is cleared by the caller in
 *  VC1_MSPEL_MC before any filter is invoked. */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                       const uint8_t *src, x86_reg stride,
                                       int rnd, int64_t shift)
{
    __asm__ volatile(
        "mov       $3, %%"REG_c"           \n\t"
        LOAD_ROUNDER_MMX("%5")
        "movq      "MANGLE(ff_pw_9)", %%mm6 \n\t"
        "1:                                \n\t"
        "movd      (%0), %%mm2             \n\t"
        "add       %2, %0                  \n\t"
        "movd      (%0), %%mm3             \n\t"
        "punpcklbw %%mm0, %%mm2            \n\t"
        "punpcklbw %%mm0, %%mm3            \n\t"
        SHIFT2_LINE(  0, 1, 2, 3, 4)
        SHIFT2_LINE( 24, 2, 3, 4, 1)
        SHIFT2_LINE( 48, 3, 4, 1, 2)
        SHIFT2_LINE( 72, 4, 1, 2, 3)
        SHIFT2_LINE( 96, 1, 2, 3, 4)
        SHIFT2_LINE(120, 2, 3, 4, 1)
        SHIFT2_LINE(144, 3, 4, 1, 2)
        SHIFT2_LINE(168, 4, 1, 2, 3)
        "sub       %6, %0                  \n\t" /* rewind src to next column: 9*stride-4 */
        "add       $8, %1                  \n\t" /* next 4-word column of dst */
        "dec       %%"REG_c"               \n\t"
        "jnz 1b                            \n\t"
        : "+r"(src), "+r"(dst)
        : "r"(stride), "r"(-2*stride),
          "m"(shift), "m"(rnd), "r"(9*stride-4)
        : "%"REG_c, "memory"
    );
}
/**
 * Horizontal 1/2-shift filter reading 16-bit (already unpacked)
 * intermediates, so some operations can be made directly from memory.
 * Applies (-1,9,9,-1), removes the -1024 bias folded into the rounder,
 * packs to bytes and stores/averages 8 rows of 8 pixels into dst.
 */
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
                                             const int16_t *src, int rnd)\
{\
    int h = 8;\
\
    src -= 1;\
    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
    __asm__ volatile(\
        LOAD_ROUNDER_MMX("%4")\
        "movq      "MANGLE(ff_pw_128)", %%mm6\n\t"\
        "movq      "MANGLE(ff_pw_9)", %%mm5 \n\t"\
        "1:                                \n\t"\
        "movq      2*0+0(%1), %%mm1        \n\t"\
        "movq      2*0+8(%1), %%mm2        \n\t"\
        "movq      2*1+0(%1), %%mm3        \n\t"\
        "movq      2*1+8(%1), %%mm4        \n\t"\
        "paddw     2*3+0(%1), %%mm1        \n\t"\
        "paddw     2*3+8(%1), %%mm2        \n\t"\
        "paddw     2*2+0(%1), %%mm3        \n\t"\
        "paddw     2*2+8(%1), %%mm4        \n\t"\
        "pmullw    %%mm5, %%mm3            \n\t"\
        "pmullw    %%mm5, %%mm4            \n\t"\
        "psubw     %%mm1, %%mm3            \n\t"\
        "psubw     %%mm2, %%mm4            \n\t"\
        NORMALIZE_MMX("$7")\
        /* Remove bias */\
        "paddw     %%mm6, %%mm3            \n\t"\
        "paddw     %%mm6, %%mm4            \n\t"\
        TRANSFER_DO_PACK(OP)\
        "add       $24, %1                 \n\t" /* src rows have a 24-word pitch */\
        "add       %3, %2                  \n\t"\
        "decl      %0                      \n\t"\
        "jnz 1b                            \n\t"\
        : "+r"(h), "+r" (src), "+r" (dst)\
        : "r"(stride), "m"(rnd)\
        : "memory"\
    );\
}

VC1_HOR_16b_SHIFT2(OP_PUT, put_)
VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
157 | |
/**
 * Purely vertical or horizontal 1/2-shift interpolation (8x8, byte in /
 * byte out).  Direction is selected by the offset argument: offset==1
 * gives horizontal filtering, offset==stride gives vertical.
 * Sacrifice mm6 for the *9 factor.
 * NOTE(review): mm0 is assumed zero — cleared by the caller in VC1_MSPEL_MC.
 */
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
                                     x86_reg stride, int rnd, x86_reg offset)\
{\
    rnd = 8-rnd;\
    __asm__ volatile(\
        "mov       $8, %%"REG_c"           \n\t"\
        LOAD_ROUNDER_MMX("%5")\
        "movq      "MANGLE(ff_pw_9)", %%mm6\n\t"\
        "1:                                \n\t"\
        "movd      0(%0   ), %%mm3         \n\t"\
        "movd      4(%0   ), %%mm4         \n\t"\
        "movd      0(%0,%2), %%mm1         \n\t"\
        "movd      4(%0,%2), %%mm2         \n\t"\
        "add       %2, %0                  \n\t"\
        "punpcklbw %%mm0, %%mm3            \n\t"\
        "punpcklbw %%mm0, %%mm4            \n\t"\
        "punpcklbw %%mm0, %%mm1            \n\t"\
        "punpcklbw %%mm0, %%mm2            \n\t"\
        "paddw     %%mm1, %%mm3            \n\t"\
        "paddw     %%mm2, %%mm4            \n\t"\
        "movd      0(%0,%3), %%mm1         \n\t"\
        "movd      4(%0,%3), %%mm2         \n\t"\
        "pmullw    %%mm6, %%mm3            \n\t" /* 0,9,9,0*/\
        "pmullw    %%mm6, %%mm4            \n\t" /* 0,9,9,0*/\
        "punpcklbw %%mm0, %%mm1            \n\t"\
        "punpcklbw %%mm0, %%mm2            \n\t"\
        "psubw     %%mm1, %%mm3            \n\t" /*-1,9,9,0*/\
        "psubw     %%mm2, %%mm4            \n\t" /*-1,9,9,0*/\
        "movd      0(%0,%2), %%mm1         \n\t"\
        "movd      4(%0,%2), %%mm2         \n\t"\
        "punpcklbw %%mm0, %%mm1            \n\t"\
        "punpcklbw %%mm0, %%mm2            \n\t"\
        "psubw     %%mm1, %%mm3            \n\t" /*-1,9,9,-1*/\
        "psubw     %%mm2, %%mm4            \n\t" /*-1,9,9,-1*/\
        NORMALIZE_MMX("$4")\
        "packuswb  %%mm4, %%mm3            \n\t"\
        OP((%1), %%mm3)\
        "movq      %%mm3, (%1)             \n\t"\
        "add       %6, %0                  \n\t" /* stride-offset: step src to next row */\
        "add       %4, %1                  \n\t"\
        "dec       %%"REG_c"               \n\t"\
        "jnz 1b                            \n\t"\
        : "+r"(src),  "+r"(dst)\
        : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
          "g"(stride-offset)\
        : "%"REG_c, "memory"\
    );\
}

VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)
/**
 * Core of the 1/4 and 3/4 shift bicubic interpolation.
 * Accumulates -3*A1 + 18*A2 + 53*A3 - 4*A4 into mm3/mm4 (mm5 must hold
 * 53, mm6 must hold 18; ff_pw_3 is read from memory).
 *
 * @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty).
 * @param MOVQ "movd 1" or "movq 2", if data read is already unpacked.
 * @param A1 Address of 1st tap (beware of unpacked/packed).
 * @param A2 Address of 2nd tap
 * @param A3 Address of 3rd tap
 * @param A4 Address of 4th tap
 */
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4)       \
     MOVQ "*0+"A1", %%mm1       \n\t"                           \
     MOVQ "*4+"A1", %%mm2       \n\t"                           \
     UNPACK("%%mm1")                                            \
     UNPACK("%%mm2")                                            \
     "pmullw    "MANGLE(ff_pw_3)", %%mm1\n\t"                   \
     "pmullw    "MANGLE(ff_pw_3)", %%mm2\n\t"                   \
     MOVQ "*0+"A2", %%mm3       \n\t"                           \
     MOVQ "*4+"A2", %%mm4       \n\t"                           \
     UNPACK("%%mm3")                                            \
     UNPACK("%%mm4")                                            \
     "pmullw    %%mm6, %%mm3    \n\t" /* *18 */                 \
     "pmullw    %%mm6, %%mm4    \n\t" /* *18 */                 \
     "psubw     %%mm1, %%mm3    \n\t" /* 18,-3 */               \
     "psubw     %%mm2, %%mm4    \n\t" /* 18,-3 */               \
     MOVQ "*0+"A4", %%mm1       \n\t"                           \
     MOVQ "*4+"A4", %%mm2       \n\t"                           \
     UNPACK("%%mm1")                                            \
     UNPACK("%%mm2")                                            \
     "psllw     $2, %%mm1       \n\t" /* 4* */                  \
     "psllw     $2, %%mm2       \n\t" /* 4* */                  \
     "psubw     %%mm1, %%mm3    \n\t" /* -4,18,-3 */            \
     "psubw     %%mm2, %%mm4    \n\t" /* -4,18,-3 */            \
     MOVQ "*0+"A3", %%mm1       \n\t"                           \
     MOVQ "*4+"A3", %%mm2       \n\t"                           \
     UNPACK("%%mm1")                                            \
     UNPACK("%%mm2")                                            \
     "pmullw    %%mm5, %%mm1    \n\t" /* *53 */                 \
     "pmullw    %%mm5, %%mm2    \n\t" /* *53 */                 \
     "paddw     %%mm1, %%mm3    \n\t" /* 4,53,18,-3 */          \
     "paddw     %%mm2, %%mm4    \n\t" /* 4,53,18,-3 */
/**
 * Macro to build the vertical 16bits version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (src_stride) and %4 (3*src_stride).  Produces 8 rows of 11
 * 16-bit intermediates at a 24-word pitch (the last 3 useful values
 * of each row are computed scalar-style after the main core).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)            \
static void                                                     \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
                                 x86_reg src_stride,            \
                                 int rnd, int64_t shift)        \
{                                                               \
    int h = 8;                                                  \
    src -= src_stride;                                          \
    __asm__ volatile(                                           \
        LOAD_ROUNDER_MMX("%5")                                  \
        "movq      "MANGLE(ff_pw_53)", %%mm5\n\t"               \
        "movq      "MANGLE(ff_pw_18)", %%mm6\n\t"               \
        ASMALIGN(3)                                             \
        "1:                                \n\t"                \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd  1", A1, A2, A3, A4)\
        NORMALIZE_MMX("%6")                                     \
        TRANSFER_DONT_PACK(OP_PUT)                              \
        /* Last 3 (in fact 4) bytes on the line */              \
        "movd      8+"A1", %%mm1           \n\t"                \
        DO_UNPACK("%%mm1")                                      \
        "movq      %%mm1, %%mm3            \n\t"                \
        "paddw     %%mm1, %%mm1            \n\t"                \
        "paddw     %%mm3, %%mm1            \n\t" /* 3* */       \
        "movd      8+"A2", %%mm3           \n\t"                \
        DO_UNPACK("%%mm3")                                      \
        "pmullw    %%mm6, %%mm3            \n\t" /* *18 */      \
        "psubw     %%mm1, %%mm3            \n\t" /*18,-3 */     \
        "movd      8+"A3", %%mm1           \n\t"                \
        DO_UNPACK("%%mm1")                                      \
        "pmullw    %%mm5, %%mm1            \n\t" /* *53 */      \
        "paddw     %%mm1, %%mm3            \n\t" /*53,18,-3 */  \
        "movd      8+"A4", %%mm1           \n\t"                \
        DO_UNPACK("%%mm1")                                      \
        "psllw     $2, %%mm1               \n\t" /* 4* */       \
        "psubw     %%mm1, %%mm3            \n\t"                \
        "paddw     %%mm7, %%mm3            \n\t"                \
        "psraw     %6, %%mm3               \n\t"                \
        "movq      %%mm3, 16(%2)           \n\t"                \
        "add       %3, %1                  \n\t"                \
        "add       $24, %2                 \n\t"                \
        "decl      %0                      \n\t"                \
        "jnz 1b                            \n\t"                \
        : "+r"(h), "+r" (src),  "+r" (dst)                      \
        : "r"(src_stride), "r"(3*src_stride),                   \
          "m"(rnd), "m"(shift)                                  \
        : "memory"                                              \
    );                                                          \
}
/**
 * Macro to build the horizontal 16bits version of vc1_put_shift[13].
 * Here, offset=16bits, so parameters passed A1 to A4 should be simple.
 * Reads 16-bit intermediates (24-word row pitch), removes the -256 bias
 * folded into the rounder, packs and stores/averages 8x8 bytes.
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)\
static void                                                     \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
                                       const int16_t *src, int rnd) \
{                                                               \
    int h = 8;                                                  \
    src -= 1;                                                   \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */                \
    __asm__ volatile(                                           \
        LOAD_ROUNDER_MMX("%4")                                  \
        "movq      "MANGLE(ff_pw_18)", %%mm6   \n\t"            \
        "movq      "MANGLE(ff_pw_53)", %%mm5   \n\t"            \
        ASMALIGN(3)                                             \
        "1:                                    \n\t"            \
        MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4)\
        NORMALIZE_MMX("$7")                                     \
        /* Remove bias */                                       \
        "paddw     "MANGLE(ff_pw_128)", %%mm3  \n\t"            \
        "paddw     "MANGLE(ff_pw_128)", %%mm4  \n\t"            \
        TRANSFER_DO_PACK(OP)                                    \
        "add       $24, %1                     \n\t"            \
        "add       %3, %2                      \n\t"            \
        "decl      %0                          \n\t"            \
        "jnz 1b                                \n\t"            \
        : "+r"(h), "+r" (src),  "+r" (dst)                      \
        : "r"(stride), "m"(rnd)                                 \
        : "memory"                                              \
    );                                                          \
}
350 | |
/**
 * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (offset) and %4 (3*offset).  Byte in / byte out, 8x8.
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)     \
static void                                                     \
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
                              x86_reg stride, int rnd, x86_reg offset) \
{                                                               \
    int h = 8;                                                  \
    src -= offset;                                              \
    rnd = 32-rnd;                                               \
    __asm__ volatile (                                          \
        LOAD_ROUNDER_MMX("%6")                                  \
        "movq      "MANGLE(ff_pw_53)", %%mm5       \n\t"        \
        "movq      "MANGLE(ff_pw_18)", %%mm6       \n\t"        \
        ASMALIGN(3)                                             \
        "1:                                        \n\t"        \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd  1", A1, A2, A3, A4)\
        NORMALIZE_MMX("$6")                                     \
        TRANSFER_DO_PACK(OP)                                    \
        "add       %5, %1                          \n\t"        \
        "add       %5, %2                          \n\t"        \
        "decl      %0                              \n\t"        \
        "jnz 1b                                    \n\t"        \
        : "+r"(h), "+r" (src),  "+r" (dst)                      \
        : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd)     \
        : "memory"                                              \
    );                                                          \
}
385 | |
/** 1/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift1, "0(%1,%4  )", "0(%1,%3,2)", "0(%1,%3  )", "0(%1     )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift1, "0(%1,%4  )", "0(%1,%3,2)", "0(%1,%3  )", "0(%1     )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4  )", "0(%1,%3,2)", "0(%1,%3  )", "0(%1     )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)

/** 3/4 shift bicubic interpolation (taps in reverse order w.r.t. 1/4) */
MSPEL_FILTER13_8B     (shift3, "0(%1     )", "0(%1,%3  )", "0(%1,%3,2)", "0(%1,%4  )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift3, "0(%1     )", "0(%1,%3  )", "0(%1,%3,2)", "0(%1,%4  )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1     )", "0(%1,%3  )", "0(%1,%3,2)", "0(%1,%4  )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)

/* Function-pointer types for the three filter families dispatched below. */
typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
403 | |
/**
 * Interpolate fractional pel values by applying proper vertical then
 * horizontal filter.
 *
 * @param dst Destination buffer for interpolated pels.
 * @param src Source buffer.
 * @param stride Stride for both src and dst buffers.
 * @param hmode Horizontal filter (expressed in quarter pixels shift).
 * @param vmode Vertical filter (expressed in quarter pixels shift).
 * @param rnd Rounding bias.
 */
#define VC1_MSPEL_MC(OP)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd)\
{\
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
         { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
         { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
         { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
    /* All filters above rely on mm0 == 0 for byte->word unpacking. */\
    __asm__ volatile(\
        "pxor %%mm0, %%mm0         \n\t"\
        ::: "memory"\
    );\
\
    if (vmode) { /* Vertical filter to apply */\
        if (hmode) { /* Horizontal filter to apply, output to tmp */\
            static const int shift_value[] = { 0, 5, 1, 5 };\
            int    shift = (shift_value[hmode]+shift_value[vmode])>>1;\
            int    r;\
            DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
\
            r = (1<<(shift-1)) + rnd-1;\
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
            return;\
        }\
        else { /* No horizontal filter, output 8 lines to dst */\
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
            return;\
        }\
    }\
\
    /* Horizontal mode with no vertical mode */\
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
}

VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)
/** Macro to ease bicubic filter interpolation functions declarations:
 *  builds the put_ (plain MMX) and avg_ (MMX2, uses pavgb) entry points
 *  for one (hmode a, vmode b) quarter-pel combination. */
#define DECLARE_FUNCTION(a, b)                                          \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
     put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                     \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                     \
}

/* All 15 fractional-pel positions; (0,0) is plain copy and not needed here. */
DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
/** 4x4 inverse transform when only the DC coefficient is non-zero:
 *  the result is a single clamped value added to every pixel of the
 *  4x4 block.  dc is split into a positive part (mm0) and a negative
 *  part (mm1) so that paddusb/psubusb give a saturated signed add. */
static void vc1_inv_trans_4x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
{
    int dc = block[0];
    dc = (17 * dc +  4) >>  3;  /* row transform of a DC-only block */
    dc = (17 * dc + 64) >>  7;  /* column transform */
    __asm__ volatile(
        "movd          %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"  /* broadcast dc to 4 words */
        "pxor       %%mm1, %%mm1 \n\t"
        "psubw      %%mm0, %%mm1 \n\t"  /* mm1 = -dc */
        "packuswb   %%mm0, %%mm0 \n\t"  /* mm0 = max(dc,0) per byte */
        "packuswb   %%mm1, %%mm1 \n\t"  /* mm1 = max(-dc,0) per byte */
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd          %0, %%mm2 \n\t"
        "movd          %1, %%mm3 \n\t"
        "movd          %2, %%mm4 \n\t"
        "movd          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"  /* saturated add of positive part */
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"  /* saturated subtract of negative part */
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movd       %%mm2, %0    \n\t"
        "movd       %%mm3, %1    \n\t"
        "movd       %%mm4, %2    \n\t"
        "movd       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}
/** 4x8 inverse transform when only the DC coefficient is non-zero:
 *  adds one clamped value to every pixel of the 4-wide, 8-tall block.
 *  Same positive/negative split trick as the 4x4 version; the 8 rows
 *  are written in two groups of 4. */
static void vc1_inv_trans_4x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
{
    int dc = block[0];
    dc = (17 * dc +  4) >>  3;  /* 4-point row transform */
    dc = (12 * dc + 64) >>  7;  /* 8-point column transform */
    __asm__ volatile(
        "movd          %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"  /* broadcast dc to 4 words */
        "pxor       %%mm1, %%mm1 \n\t"
        "psubw      %%mm0, %%mm1 \n\t"  /* mm1 = -dc */
        "packuswb   %%mm0, %%mm0 \n\t"  /* mm0 = max(dc,0) per byte */
        "packuswb   %%mm1, %%mm1 \n\t"  /* mm1 = max(-dc,0) per byte */
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd          %0, %%mm2 \n\t"
        "movd          %1, %%mm3 \n\t"
        "movd          %2, %%mm4 \n\t"
        "movd          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movd       %%mm2, %0    \n\t"
        "movd       %%mm3, %1    \n\t"
        "movd       %%mm4, %2    \n\t"
        "movd       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;  /* second half of the 8 rows */
    __asm__ volatile(
        "movd          %0, %%mm2 \n\t"
        "movd          %1, %%mm3 \n\t"
        "movd          %2, %%mm4 \n\t"
        "movd          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movd       %%mm2, %0    \n\t"
        "movd       %%mm3, %1    \n\t"
        "movd       %%mm4, %2    \n\t"
        "movd       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
583 |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
584 static void vc1_inv_trans_8x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
585 { |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
586 int dc = block[0]; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
587 dc = ( 3 * dc + 1) >> 1; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
588 dc = (17 * dc + 64) >> 7; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
589 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
590 "movd %0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
591 "pshufw $0, %%mm0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
592 "pxor %%mm1, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
593 "psubw %%mm0, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
594 "packuswb %%mm0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
595 "packuswb %%mm1, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
596 ::"r"(dc) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
597 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
598 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
599 "movq %0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
600 "movq %1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
601 "movq %2, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
602 "movq %3, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
603 "paddusb %%mm0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
604 "paddusb %%mm0, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
605 "paddusb %%mm0, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
606 "paddusb %%mm0, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
607 "psubusb %%mm1, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
608 "psubusb %%mm1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
609 "psubusb %%mm1, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
610 "psubusb %%mm1, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
611 "movq %%mm2, %0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
612 "movq %%mm3, %1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
613 "movq %%mm4, %2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
614 "movq %%mm5, %3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
615 :"+m"(*(uint32_t*)(dest+0*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
616 "+m"(*(uint32_t*)(dest+1*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
617 "+m"(*(uint32_t*)(dest+2*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
618 "+m"(*(uint32_t*)(dest+3*linesize)) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
619 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
620 } |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
621 |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
622 static void vc1_inv_trans_8x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
623 { |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
624 int dc = block[0]; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
625 dc = (3 * dc + 1) >> 1; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
626 dc = (3 * dc + 16) >> 5; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
627 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
628 "movd %0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
629 "pshufw $0, %%mm0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
630 "pxor %%mm1, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
631 "psubw %%mm0, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
632 "packuswb %%mm0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
633 "packuswb %%mm1, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
634 ::"r"(dc) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
635 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
636 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
637 "movq %0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
638 "movq %1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
639 "movq %2, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
640 "movq %3, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
641 "paddusb %%mm0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
642 "paddusb %%mm0, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
643 "paddusb %%mm0, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
644 "paddusb %%mm0, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
645 "psubusb %%mm1, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
646 "psubusb %%mm1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
647 "psubusb %%mm1, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
648 "psubusb %%mm1, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
649 "movq %%mm2, %0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
650 "movq %%mm3, %1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
651 "movq %%mm4, %2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
652 "movq %%mm5, %3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
653 :"+m"(*(uint32_t*)(dest+0*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
654 "+m"(*(uint32_t*)(dest+1*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
655 "+m"(*(uint32_t*)(dest+2*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
656 "+m"(*(uint32_t*)(dest+3*linesize)) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
657 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
658 dest += 4*linesize; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
659 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
660 "movq %0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
661 "movq %1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
662 "movq %2, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
663 "movq %3, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
664 "paddusb %%mm0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
665 "paddusb %%mm0, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
666 "paddusb %%mm0, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
667 "paddusb %%mm0, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
668 "psubusb %%mm1, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
669 "psubusb %%mm1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
670 "psubusb %%mm1, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
671 "psubusb %%mm1, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
672 "movq %%mm2, %0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
673 "movq %%mm3, %1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
674 "movq %%mm4, %2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
675 "movq %%mm5, %3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
676 :"+m"(*(uint32_t*)(dest+0*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
677 "+m"(*(uint32_t*)(dest+1*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
678 "+m"(*(uint32_t*)(dest+2*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
679 "+m"(*(uint32_t*)(dest+3*linesize)) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
680 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
681 } |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
682 |
/* Declare the assembly-implemented 4- and 8-pixel VC-1 loop filters for a
 * given instruction-set extension EXT, and build the 16-pixel variants as
 * two calls to the 8-pixel one: side by side for the vertical filter
 * (src, src+8), stacked for the horizontal filter (src, src+8*stride). */
#define LOOP_FILTER(EXT) \
void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
\
static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
    ff_vc1_v_loop_filter8_ ## EXT(src,   stride, pq); \
    ff_vc1_v_loop_filter8_ ## EXT(src+8, stride, pq); \
} \
\
static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
    ff_vc1_h_loop_filter8_ ## EXT(src,          stride, pq); \
    ff_vc1_h_loop_filter8_ ## EXT(src+8*stride, stride, pq); \
}
700 | |
#if HAVE_YASM
/* Instantiate declarations and 16-pixel wrappers for each yasm flavor. */
LOOP_FILTER(mmx)
LOOP_FILTER(mmx2)
LOOP_FILTER(sse2)
LOOP_FILTER(ssse3)

void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq);

/* SSE4 provides only the horizontal 8-pixel filter, so only the
 * horizontal 16-pixel wrapper exists for it. */
static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
{
    ff_vc1_h_loop_filter8_sse4(src,          stride, pq);
    ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
}
#endif
715 | |
8430 | 716 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { |
12414 | 717 int mm_flags = mm_support(); |
9441 | 718 |
8430 | 719 dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx; |
720 dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx; | |
721 dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx; | |
722 dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx; | |
723 | |
724 dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx; | |
725 dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx; | |
726 dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx; | |
727 dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx; | |
728 | |
729 dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx; | |
730 dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx; | |
731 dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx; | |
732 dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx; | |
733 | |
734 dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx; | |
735 dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx; | |
736 dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; | |
737 dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; | |
9441 | 738 |
739 if (mm_flags & FF_MM_MMX2){ | |
740 dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmx2; | |
741 dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmx2; | |
742 dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmx2; | |
743 dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmx2; | |
744 | |
745 dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmx2; | |
746 dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmx2; | |
747 dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmx2; | |
748 dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmx2; | |
749 | |
750 dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmx2; | |
751 dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmx2; | |
752 dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmx2; | |
753 dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmx2; | |
754 | |
755 dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmx2; | |
756 dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2; | |
757 dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2; | |
758 dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2; | |
9859
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
759 |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
760 dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
761 dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
762 dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
763 dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2; |
9441 | 764 } |
12144 | 765 |
766 #define ASSIGN_LF(EXT) \ | |
767 dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_ ## EXT; \ | |
768 dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_ ## EXT; \ | |
769 dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_ ## EXT; \ | |
770 dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_ ## EXT; \ | |
771 dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_ ## EXT; \ | |
772 dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT | |
773 | |
774 #if HAVE_YASM | |
775 if (mm_flags & FF_MM_MMX) { | |
776 ASSIGN_LF(mmx); | |
777 } | |
778 return; | |
779 if (mm_flags & FF_MM_MMX2) { | |
780 ASSIGN_LF(mmx2); | |
781 } | |
782 if (mm_flags & FF_MM_SSE2) { | |
783 dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2; | |
784 dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2; | |
785 dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2; | |
786 dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2; | |
787 } | |
788 if (mm_flags & FF_MM_SSSE3) { | |
789 ASSIGN_LF(ssse3); | |
790 } | |
791 if (mm_flags & FF_MM_SSE4) { | |
792 dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4; | |
793 dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4; | |
794 } | |
795 #endif | |
8430 | 796 } |