Mercurial > libavcodec.hg
annotate x86/vc1dsp_mmx.c @ 9859:7a116de63777 libavcodec
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
Includes mmx2 asm for the various functions.
Note that the actual idct still does not have an x86 SIMD implementation.
For wmv3 files using regular idct, the decoder just falls back to simple_idct,
since simple_idct_dc doesn't exist (yet).
author | darkshikari |
---|---|
date | Tue, 16 Jun 2009 09:00:55 +0000 |
parents | e14cd3ac3806 |
children | 34a65026fa06 |
rev | line source |
---|---|
8430 | 1 /* |
2 * VC-1 and WMV3 - DSP functions MMX-optimized | |
3 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr> | |
4 * | |
5 * Permission is hereby granted, free of charge, to any person | |
6 * obtaining a copy of this software and associated documentation | |
7 * files (the "Software"), to deal in the Software without | |
8 * restriction, including without limitation the rights to use, | |
9 * copy, modify, merge, publish, distribute, sublicense, and/or sell | |
10 * copies of the Software, and to permit persons to whom the | |
11 * Software is furnished to do so, subject to the following | |
12 * conditions: | |
13 * | |
14 * The above copyright notice and this permission notice shall be | |
15 * included in all copies or substantial portions of the Software. | |
16 * | |
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | |
19 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
21 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
22 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
24 * OTHER DEALINGS IN THE SOFTWARE. | |
25 */ | |
26 | |
27 #include "libavutil/x86_cpu.h" | |
28 #include "libavcodec/dsputil.h" | |
29 #include "dsputil_mmx.h" | |
30 | |
/* Store-combining operators spliced into the asm by TRANSFER_DO_PACK,
 * TRANSFER_DONT_PACK and VC1_SHIFT2 just before the final store:
 * OP_PUT expands to nothing (the result simply overwrites the destination),
 * OP_AVG emits "pavgb S, D" (byte-wise average of S into register D). */
9441 | 31 #define OP_PUT(S,D) |
32 #define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t" | |
33 | |
8430 | 34 /** Add the rounder held in mm7 to mm3 and mm4, then arithmetic-shift both right by SHIFT (no packing or store happens here; see the TRANSFER_* macros) */ |
35 #define NORMALIZE_MMX(SHIFT) \ | |
36 "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \ | |
37 "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \ | |
38 "psraw "SHIFT", %%mm3 \n\t" \ | |
39 "psraw "SHIFT", %%mm4 \n\t" | |
40 | |
/** Pack the 8 words in mm3 (low 4) and mm4 (high 4) into 8 unsigned-saturated
 *  bytes in mm3, apply OP against the 8 bytes at (%2), then store mm3 to (%2). */
9441 | 41 #define TRANSFER_DO_PACK(OP) \ |
8430 | 42 "packuswb %%mm4, %%mm3 \n\t" \ |
9441 | 43 OP((%2), %%mm3) \ |
8430 | 44 "movq %%mm3, (%2) \n\t" |
45 | |
/** Store mm3 and mm4 unpacked, as 2x4 16-bit words, at 0(%2) and 8(%2),
 *  applying OP to each register first. The only use in this file passes
 *  OP_PUT; note that OP_AVG would average byte-wise, not word-wise. */
9441 | 46 #define TRANSFER_DONT_PACK(OP) \ |
47 OP(0(%2), %%mm3) \ | |
48 OP(8(%2), %%mm4) \ | |
8430 | 49 "movq %%mm3, 0(%2) \n\t" \ |
50 "movq %%mm4, 8(%2) \n\t" | |
51 | |
52 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro
 * DO_UNPACK interleaves the low 4 bytes of reg with mm0, yielding 4 words;
 * this zero-extends the bytes provided mm0 is zero (VC1_MSPEL_MC clears it
 * with pxor before calling any filter). DONT_UNPACK expands to nothing, for
 * data that is already 16-bit. */ | |
53 #define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t" | |
54 #define DONT_UNPACK(reg) | |
55 | |
56 /** Load the already-computed rounder ROUND (callers pass e.g. 32-r or 8-r) and replicate its low 16-bit word into all four words of mm7 */ | |
57 #define LOAD_ROUNDER_MMX(ROUND) \ | |
58 "movd "ROUND", %%mm7 \n\t" \ | |
59 "punpcklwd %%mm7, %%mm7 \n\t" \ | |
60 "punpckldq %%mm7, %%mm7 \n\t" | |
61 | |
/**
 * One output row (4 pixels) of the vertical (-1,9,9,-1) filter.
 * On entry mmR1 and mmR2 hold the two centre rows, already unpacked to words.
 * The macro computes mmR1 = ((mmR1 + mmR2) * mm6 - row(%0,%3) - row(%0,%2)
 * + mm7) >> %4, stores the 4 words at OFF(%1), and advances %0 by %2.
 * mmR0 and mmR3 are left holding the two outer rows (unpacked); the caller
 * rotates the register numbers so that loaded rows are reused by the next
 * invocation. Expects mm0 = 0, mm6 = multiplier (9 in the caller), mm7 = rounder.
 */
62 #define SHIFT2_LINE(OFF, R0,R1,R2,R3) \ | |
63 "paddw %%mm"#R2", %%mm"#R1" \n\t" \ | |
64 "movd (%0,%3), %%mm"#R0" \n\t" \ | |
65 "pmullw %%mm6, %%mm"#R1" \n\t" \ | |
66 "punpcklbw %%mm0, %%mm"#R0" \n\t" \ | |
67 "movd (%0,%2), %%mm"#R3" \n\t" \ | |
68 "psubw %%mm"#R0", %%mm"#R1" \n\t" \ | |
69 "punpcklbw %%mm0, %%mm"#R3" \n\t" \ | |
70 "paddw %%mm7, %%mm"#R1" \n\t" \ | |
71 "psubw %%mm"#R3", %%mm"#R1" \n\t" \ | |
72 "psraw %4, %%mm"#R1" \n\t" \ | |
73 "movq %%mm"#R1", "#OFF"(%1) \n\t" \ | |
74 "add %2, %0 \n\t" | |
75 | |
76 DECLARE_ALIGNED_16(const uint64_t, ff_pw_9) = 0x0009000900090009ULL; | |
77 | |
78 /**
 * Vertical half-pel (-1,9,9,-1) filter from 8-bit src to 16-bit dst.
 * Sacrificing mm6 (it holds the constant 9) allows pipelining the loads
 * from src.
 *
 * Processes 3 passes of 4 columns x 8 rows, i.e. 12 columns in total; dst
 * rows are 24 bytes (12 int16_t) apart, matching the tmp[12*8] buffer used
 * by VC1_MSPEL_MC. Expects mm0 to be zero on entry.
 *
 * Asm operands: %0 src, %1 dst, %2 stride, %3 -2*stride, %4 shift (memory),
 * %5 rnd (memory), %6 9*stride-4.
 *
 * @param dst    Destination for the unpacked 16-bit intermediate values.
 * @param src    Source pixels; rows at -stride .. +2*stride around the
 *               current row are read.
 * @param stride Distance in bytes between source rows.
 * @param rnd    Rounder added before the shift.
 * @param shift  Right-shift amount; NOTE(review): presumably 64-bit because
 *               psraw with a memory operand reads a 64-bit count - confirm.
 */ | |
79 static void vc1_put_ver_16b_shift2_mmx(int16_t *dst, | |
80 const uint8_t *src, x86_reg stride, | |
81 int rnd, int64_t shift) | |
82 { | |
83 __asm__ volatile( | |
84 "mov $3, %%"REG_c" \n\t" /* 3 passes of 4 columns each */ | |
85 LOAD_ROUNDER_MMX("%5") | |
86 "movq "MANGLE(ff_pw_9)", %%mm6 \n\t" | |
87 "1: \n\t" | |
88 "movd (%0), %%mm2 \n\t" | |
89 "add %2, %0 \n\t" | |
90 "movd (%0), %%mm3 \n\t" | |
91 "punpcklbw %%mm0, %%mm2 \n\t" | |
92 "punpcklbw %%mm0, %%mm3 \n\t" | |
93 SHIFT2_LINE( 0, 1, 2, 3, 4) | |
94 SHIFT2_LINE( 24, 2, 3, 4, 1) | |
95 SHIFT2_LINE( 48, 3, 4, 1, 2) | |
96 SHIFT2_LINE( 72, 4, 1, 2, 3) | |
97 SHIFT2_LINE( 96, 1, 2, 3, 4) | |
98 SHIFT2_LINE(120, 2, 3, 4, 1) | |
99 SHIFT2_LINE(144, 3, 4, 1, 2) | |
100 SHIFT2_LINE(168, 4, 1, 2, 3) | |
101 "sub %6, %0 \n\t" /* src advanced 9 rows above; net effect is +4 bytes (next 4 columns) */ | |
102 "add $8, %1 \n\t" /* next 4 int16_t columns of dst */ | |
103 "dec %%"REG_c" \n\t" | |
104 "jnz 1b \n\t" | |
105 : "+r"(src), "+r"(dst) | |
106 : "r"(stride), "r"(-2*stride), | |
107 "m"(shift), "m"(rnd), "r"(9*stride-4) | |
108 : "%"REG_c, "memory" | |
109 ); | |
110 } | |
111 | |
112 /** | |
113 * Data is already unpacked, so some operations can directly be made from | |
114 * memory. | |
 *
 * Builds OPNAME##vc1_hor_16b_shift2_mmx: horizontal (-1,9,9,-1) filter over
 * the 16-bit intermediate buffer, producing 8 rows of 8 output bytes.
 * Source rows are 24 bytes apart; dst advances by stride per row.
 * src is moved back one element so the four taps sit at src[-1..2].
 * Asm operands: %0 h (row counter), %1 src, %2 dst, %3 stride, %4 rnd.
 *
 * @param OP     OP_PUT or OP_AVG, applied just before the store.
 * @param OPNAME Prefix of the generated function name (put_ or avg_).
115 */ | |
9441 | 116 #define VC1_HOR_16b_SHIFT2(OP, OPNAME)\ |
117 static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\ | |
118 const int16_t *src, int rnd)\ | |
119 {\ | |
120 int h = 8;\ | |
121 \ | |
122 src -= 1;\ | |
123 rnd -= (-1+9+9-1)*1024; /* Add -1024 bias times the tap sum (16): 16384, i.e. 128 after the >>7 below */\ | |
124 __asm__ volatile(\ | |
125 LOAD_ROUNDER_MMX("%4")\ | |
126 "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\ | |
127 "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\ | |
128 "1: \n\t"\ | |
129 "movq 2*0+0(%1), %%mm1 \n\t"\ | |
130 "movq 2*0+8(%1), %%mm2 \n\t"\ | |
131 "movq 2*1+0(%1), %%mm3 \n\t"\ | |
132 "movq 2*1+8(%1), %%mm4 \n\t"\ | |
133 "paddw 2*3+0(%1), %%mm1 \n\t"\ | |
134 "paddw 2*3+8(%1), %%mm2 \n\t"\ | |
135 "paddw 2*2+0(%1), %%mm3 \n\t"\ | |
136 "paddw 2*2+8(%1), %%mm4 \n\t"\ | |
137 "pmullw %%mm5, %%mm3 \n\t"\ | |
138 "pmullw %%mm5, %%mm4 \n\t"\ | |
139 "psubw %%mm1, %%mm3 \n\t"\ | |
140 "psubw %%mm2, %%mm4 \n\t"\ | |
141 NORMALIZE_MMX("$7")\ | |
142 /* Remove bias (mm6 was loaded from ff_pw_128 above) */\ | |
143 "paddw %%mm6, %%mm3 \n\t"\ | |
144 "paddw %%mm6, %%mm4 \n\t"\ | |
145 TRANSFER_DO_PACK(OP)\ | |
146 "add $24, %1 \n\t"\ | |
147 "add %3, %2 \n\t"\ | |
148 "decl %0 \n\t"\ | |
149 "jnz 1b \n\t"\ | |
150 : "+r"(h), "+r" (src), "+r" (dst)\ | |
151 : "r"(stride), "m"(rnd)\ | |
152 : "memory"\ | |
153 );\ | |
154 } | |
8430 | 155 |
9441 | 156 VC1_HOR_16b_SHIFT2(OP_PUT, put_) |
157 VC1_HOR_16b_SHIFT2(OP_AVG, avg_) | |
8430 | 158 |
159 | |
160 /** | |
161 * Purely vertical or horizontal 1/2 shift interpolation. | |
162 * Sacrifices mm6 for the *9 factor. | |
 *
 * Builds OPNAME##vc1_shift2_mmx: 8 rows of 8 pixels filtered with
 * (-1,9,9,-1), rounded with 8-rnd and shifted right by 4.
 * offset is the distance in bytes between filter taps; the callers in this
 * file pass stride for the vertical case and 1 for the horizontal case.
 * Per row, src advances by offset and then by stride-offset (net: stride).
 * Expects mm0 to be zero on entry.
 * Asm operands: %0 src, %1 dst, %2 offset, %3 -2*offset, %4 stride,
 * %5 rnd (memory), %6 stride-offset.
 *
 * @param OP     OP_PUT or OP_AVG, applied just before the store.
 * @param OPNAME Prefix of the generated function name (put_ or avg_).
163 */ | |
9441 | 164 #define VC1_SHIFT2(OP, OPNAME)\ |
165 static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\ | |
166 x86_reg stride, int rnd, x86_reg offset)\ | |
167 {\ | |
168 rnd = 8-rnd;\ | |
169 __asm__ volatile(\ | |
170 "mov $8, %%"REG_c" \n\t"\ | |
171 LOAD_ROUNDER_MMX("%5")\ | |
172 "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\ | |
173 "1: \n\t"\ | |
174 "movd 0(%0 ), %%mm3 \n\t"\ | |
175 "movd 4(%0 ), %%mm4 \n\t"\ | |
176 "movd 0(%0,%2), %%mm1 \n\t"\ | |
177 "movd 4(%0,%2), %%mm2 \n\t"\ | |
178 "add %2, %0 \n\t"\ | |
179 "punpcklbw %%mm0, %%mm3 \n\t"\ | |
180 "punpcklbw %%mm0, %%mm4 \n\t"\ | |
181 "punpcklbw %%mm0, %%mm1 \n\t"\ | |
182 "punpcklbw %%mm0, %%mm2 \n\t"\ | |
183 "paddw %%mm1, %%mm3 \n\t"\ | |
184 "paddw %%mm2, %%mm4 \n\t"\ | |
185 "movd 0(%0,%3), %%mm1 \n\t"\ | |
186 "movd 4(%0,%3), %%mm2 \n\t"\ | |
187 "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\ | |
188 "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\ | |
189 "punpcklbw %%mm0, %%mm1 \n\t"\ | |
190 "punpcklbw %%mm0, %%mm2 \n\t"\ | |
191 "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\ | |
192 "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\ | |
193 "movd 0(%0,%2), %%mm1 \n\t"\ | |
194 "movd 4(%0,%2), %%mm2 \n\t"\ | |
195 "punpcklbw %%mm0, %%mm1 \n\t"\ | |
196 "punpcklbw %%mm0, %%mm2 \n\t"\ | |
197 "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\ | |
198 "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\ | |
199 NORMALIZE_MMX("$4")\ | |
200 "packuswb %%mm4, %%mm3 \n\t"\ | |
201 OP((%1), %%mm3)\ | |
202 "movq %%mm3, (%1) \n\t"\ | |
203 "add %6, %0 \n\t"\ | |
204 "add %4, %1 \n\t"\ | |
205 "dec %%"REG_c" \n\t"\ | |
206 "jnz 1b \n\t"\ | |
207 : "+r"(src), "+r"(dst)\ | |
208 : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\ | |
209 "g"(stride-offset)\ | |
210 : "%"REG_c, "memory"\ | |
211 );\ | |
8430 | 212 } |
213 | |
9441 | 214 VC1_SHIFT2(OP_PUT, put_) |
215 VC1_SHIFT2(OP_AVG, avg_) | |
216 | |
8430 | 217 /** |
218 * Filter coefficients made global to allow access by all 1 or 3 quarter shift | |
219 * interpolation functions. | |
220 */ | |
221 DECLARE_ASM_CONST(16, uint64_t, ff_pw_53) = 0x0035003500350035ULL; | |
222 DECLARE_ASM_CONST(16, uint64_t, ff_pw_18) = 0x0012001200120012ULL; | |
223 | |
224 /** | |
225 * Core of the 1/4 and 3/4 shift bicubic interpolation. | |
226 * | |
 * Computes -3*A1 + 18*A2 + 53*A3 - 4*A4 for 8 samples, leaving the low 4
 * results in mm3 and the high 4 in mm4 (as words, before rounding/shift).
 * Expects mm5 = 53 and mm6 = 18 in every word; uses mm1 and mm2 as scratch.
 *
227 * @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty). | |
228 * @param MOVQ "movd 1" or "movq 2", if data read is already unpacked. | |
 *             The trailing digit is the element size: it is pasted in front
 *             of "*0+" / "*4+" to form the byte offsets of the two halves
 *             (0 and 4 for bytes, 0 and 8 for words).
229 * @param A1 Address of 1st tap (beware of unpacked/packed). | |
230 * @param A2 Address of 2nd tap | |
231 * @param A3 Address of 3rd tap | |
232 * @param A4 Address of 4th tap | |
233 */ | |
234 #define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \ | |
235 MOVQ "*0+"A1", %%mm1 \n\t" \ | |
236 MOVQ "*4+"A1", %%mm2 \n\t" \ | |
237 UNPACK("%%mm1") \ | |
238 UNPACK("%%mm2") \ | |
239 "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \ | |
240 "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \ | |
241 MOVQ "*0+"A2", %%mm3 \n\t" \ | |
242 MOVQ "*4+"A2", %%mm4 \n\t" \ | |
243 UNPACK("%%mm3") \ | |
244 UNPACK("%%mm4") \ | |
245 "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \ | |
246 "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \ | |
247 "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \ | |
248 "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \ | |
249 MOVQ "*0+"A4", %%mm1 \n\t" \ | |
250 MOVQ "*4+"A4", %%mm2 \n\t" \ | |
251 UNPACK("%%mm1") \ | |
252 UNPACK("%%mm2") \ | |
253 "psllw $2, %%mm1 \n\t" /* 4* */ \ | |
254 "psllw $2, %%mm2 \n\t" /* 4* */ \ | |
255 "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \ | |
256 "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \ | |
257 MOVQ "*0+"A3", %%mm1 \n\t" \ | |
258 MOVQ "*4+"A3", %%mm2 \n\t" \ | |
259 UNPACK("%%mm1") \ | |
260 UNPACK("%%mm2") \ | |
261 "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \ | |
262 "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \ | |
263 "paddw %%mm1, %%mm3 \n\t" /* -4,53,18,-3 */ \ | |
264 "paddw %%mm2, %%mm4 \n\t" /* -4,53,18,-3 */ | |
265 | |
266 /** | |
267 * Macro to build the vertical 16bits version of vc1_put_shift[13]. | |
268 * Here, offset=src_stride. Parameters passed A1 to A4 must use | |
269 * %3 (src_stride) and %4 (3*src_stride). | |
270 * | |
 * The generated function writes 8 rows of 12 int16_t (24 bytes per row):
 * 8 values through MSPEL_FILTER13_CORE + TRANSFER_DONT_PACK, then 4 more
 * computed separately and stored at 16(%2). src is moved up one row first,
 * so the four taps cover rows -1 .. +2 relative to the caller's pointer.
 * Expects mm0 to be zero on entry.
 * Asm operands: %0 h (row counter), %1 src, %2 dst, %3 src_stride,
 * %4 3*src_stride, %5 rnd (memory), %6 shift (memory).
 *
271 * @param NAME Either 1 or 3 | |
272 * @see MSPEL_FILTER13_CORE for information on A1->A4 | |
273 */ | |
274 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \ | |
275 static void \ | |
276 vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \ | |
277 x86_reg src_stride, \ | |
278 int rnd, int64_t shift) \ | |
279 { \ | |
280 int h = 8; \ | |
281 src -= src_stride; \ | |
282 __asm__ volatile( \ | |
283 LOAD_ROUNDER_MMX("%5") \ | |
284 "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \ | |
285 "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \ | |
286 ASMALIGN(3) \ | |
287 "1: \n\t" \ | |
288 MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ | |
289 NORMALIZE_MMX("%6") \ | |
9441 | 290 TRANSFER_DONT_PACK(OP_PUT) \ |
8430 | 291 /* Last 3 (in fact 4) bytes on the line */ \ |
292 "movd 8+"A1", %%mm1 \n\t" \ | |
293 DO_UNPACK("%%mm1") \ | |
294 "movq %%mm1, %%mm3 \n\t" \ | |
295 "paddw %%mm1, %%mm1 \n\t" \ | |
296 "paddw %%mm3, %%mm1 \n\t" /* 3* */ \ | |
297 "movd 8+"A2", %%mm3 \n\t" \ | |
298 DO_UNPACK("%%mm3") \ | |
299 "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \ | |
300 "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \ | |
301 "movd 8+"A3", %%mm1 \n\t" \ | |
302 DO_UNPACK("%%mm1") \ | |
303 "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \ | |
304 "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \ | |
305 "movd 8+"A4", %%mm1 \n\t" \ | |
306 DO_UNPACK("%%mm1") \ | |
307 "psllw $2, %%mm1 \n\t" /* 4* */ \ | |
308 "psubw %%mm1, %%mm3 \n\t" \ | |
309 "paddw %%mm7, %%mm3 \n\t" \ | |
310 "psraw %6, %%mm3 \n\t" \ | |
311 "movq %%mm3, 16(%2) \n\t" \ | |
312 "add %3, %1 \n\t" \ | |
313 "add $24, %2 \n\t" \ | |
314 "decl %0 \n\t" \ | |
315 "jnz 1b \n\t" \ | |
316 : "+r"(h), "+r" (src), "+r" (dst) \ | |
317 : "r"(src_stride), "r"(3*src_stride), \ | |
318 "m"(rnd), "m"(shift) \ | |
319 : "memory" \ | |
320 ); \ | |
321 } | |
322 | |
323 /** | |
324 * Macro to build the horizontal 16bits version of vc1_put_shift[13]. | |
325 * Here, offset=16bits, so parameters passed A1 to A4 should be simple. | |
326 * | |
 * The generated function reads 8 rows of the 16-bit intermediate buffer
 * (rows 24 bytes apart), filters 8 samples per row, shifts right by 7,
 * packs to bytes and applies OP before storing 8 bytes per row to dst.
 * src is moved back one element so the four taps sit at src[-1..2].
 * Asm operands: %0 h (row counter), %1 src, %2 dst, %3 stride, %4 rnd.
 *
327 * @param NAME Either 1 or 3 | |
 * @param OP     OP_PUT or OP_AVG, applied just before the store.
 * @param OPNAME Prefix of the generated function name (put_ or avg_).
328 * @see MSPEL_FILTER13_CORE for information on A1->A4 | |
329 */ | |
9441 | 330 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \ |
8430 | 331 static void \ |
9441 | 332 OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \ |
8430 | 333 const int16_t *src, int rnd) \ |
334 { \ | |
335 int h = 8; \ | |
336 src -= 1; \ | |
337 rnd -= (-4+58+13-3)*256; /* Add -256 bias times the tap sum (64); the taps actually applied by MSPEL_FILTER13_CORE are -4,53,18,-3, which also sum to 64 */ \ | |
338 __asm__ volatile( \ | |
339 LOAD_ROUNDER_MMX("%4") \ | |
340 "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ | |
341 "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ | |
342 ASMALIGN(3) \ | |
343 "1: \n\t" \ | |
344 MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \ | |
345 NORMALIZE_MMX("$7") \ | |
346 /* Remove bias (64*256 >> 7 == 128) */ \ | |
347 "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \ | |
348 "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \ | |
9441 | 349 TRANSFER_DO_PACK(OP) \ |
8430 | 350 "add $24, %1 \n\t" \ |
351 "add %3, %2 \n\t" \ | |
352 "decl %0 \n\t" \ | |
353 "jnz 1b \n\t" \ | |
354 : "+r"(h), "+r" (src), "+r" (dst) \ | |
355 : "r"(stride), "m"(rnd) \ | |
356 : "memory" \ | |
357 ); \ | |
358 } | |
359 | |
360 /** | |
361 * Macro to build the 8bits, any direction, version of vc1_put_shift[13]. | |
362 * Here, offset is the distance in bytes between filter taps (the callers in | |
 * this file pass stride for vertical and 1 for horizontal filtering).
 * Parameters passed A1 to A4 must use | |
363 * %3 (offset) and %4 (3*offset). | |
364 * | |
 * The generated function filters 8 rows of 8 pixels, rounds with 32-rnd,
 * shifts right by 6, packs to bytes and applies OP before the store.
 * src is moved back by one offset first; src and dst both advance by
 * stride per row. Expects mm0 to be zero on entry.
 * Asm operands: %0 h (row counter), %1 src, %2 dst, %3 offset,
 * %4 3*offset, %5 stride, %6 rnd (memory).
 *
365 * @param NAME Either 1 or 3 | |
 * @param OP     OP_PUT or OP_AVG, applied just before the store.
 * @param OPNAME Prefix of the generated function name (put_ or avg_).
366 * @see MSPEL_FILTER13_CORE for information on A1->A4 | |
367 */ | |
9441 | 368 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \ |
8430 | 369 static void \ |
9441 | 370 OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \ |
8430 | 371 x86_reg stride, int rnd, x86_reg offset) \ |
372 { \ | |
373 int h = 8; \ | |
374 src -= offset; \ | |
375 rnd = 32-rnd; \ | |
376 __asm__ volatile ( \ | |
377 LOAD_ROUNDER_MMX("%6") \ | |
378 "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ | |
379 "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ | |
380 ASMALIGN(3) \ | |
381 "1: \n\t" \ | |
382 MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ | |
383 NORMALIZE_MMX("$6") \ | |
9441 | 384 TRANSFER_DO_PACK(OP) \ |
8430 | 385 "add %5, %1 \n\t" \ |
386 "add %5, %2 \n\t" \ | |
387 "decl %0 \n\t" \ | |
388 "jnz 1b \n\t" \ | |
389 : "+r"(h), "+r" (src), "+r" (dst) \ | |
390 : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \ | |
391 : "memory" \ | |
392 ); \ | |
393 } | |
394 | |
395 /** 1/4 shift bicubic interpolation */ | |
9441 | 396 MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_) |
397 MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_) | |
8430 | 398 MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )") |
9441 | 399 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_) |
400 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_) | |
8430 | 401 |
402 /** 3/4 shift bicubic interpolation */ | |
9441 | 403 MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_) |
404 MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_) | |
8430 | 405 MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )") |
9441 | 406 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_) |
407 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_) | |
8430 | 408 |
409 typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift); | |
410 typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd); | |
411 typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset); | |
412 | |
413 /** | |
414 * Interpolates fractional pel values by applying proper vertical then | |
415 * horizontal filter. | |
416 * | |
 * The macro argument OP is the name prefix (put_ or avg_) used both for the
 * generated function and to select the matching horizontal / 8-bit filters;
 * the vertical 16-bit pass always uses the put variants.
 * mm0 is cleared here because the filter routines use it as the zero
 * register when unpacking bytes to words.
 * Entry 0 of every dispatch table is NULL, so at least one of hmode and
 * vmode must be non-zero.
 *
417 * @param dst Destination buffer for interpolated pels. | |
418 * @param src Source buffer. | |
419 * @param stride Stride for both src and dst buffers. | |
420 * @param hmode Horizontal filter (expressed in quarter pixels shift). | |
421 * @param vmode Vertical filter (expressed in quarter pixels shift). | |
422 * @param rnd Rounding bias. | |
423 */ | |
9441 | 424 #define VC1_MSPEL_MC(OP)\ |
425 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ | |
426 int hmode, int vmode, int rnd)\ | |
427 {\ | |
428 static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\ | |
429 { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\ | |
430 static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\ | |
431 { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\ | |
432 static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\ | |
433 { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\ | |
434 \ | |
435 __asm__ volatile(\ | |
436 "pxor %%mm0, %%mm0 \n\t"\ | |
437 ::: "memory"\ | |
438 );\ | |
439 \ | |
440 if (vmode) { /* Vertical filter to apply */\ | |
441 if (hmode) { /* Horizontal filter to apply, output to tmp */\ | |
442 static const int shift_value[] = { 0, 5, 1, 5 };\ | |
443 int shift = (shift_value[hmode]+shift_value[vmode])>>1;\ | |
444 int r;\ | |
445 DECLARE_ALIGNED_16(int16_t, tmp[12*8]);\ | |
446 \ | |
447 r = (1<<(shift-1)) + rnd-1;\ | |
448 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); /* start one column left: 12 columns are produced for the horizontal taps */\ | |
449 \ | |
450 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); /* tmp+1 is the column matching src */\ | |
451 return;\ | |
452 }\ | |
453 else { /* No horizontal filter, output 8 lines to dst */\ | |
454 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\ | |
455 return;\ | |
456 }\ | |
457 }\ | |
458 \ | |
459 /* Horizontal mode with no vertical mode */\ | |
460 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\ | |
8430 | 461 } |
462 | |
9441 | 463 VC1_MSPEL_MC(put_) |
464 VC1_MSPEL_MC(avg_) | |
465 | |
8430 | 466 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd); |
9441 | 467 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd); |
8430 | 468 |
469 /** Macro to ease bicubic filter interpolation functions declarations.
 * For each (a, b) it defines put_vc1_mspel_mc<a><b>_mmx and
 * avg_vc1_mspel_mc<a><b>_mmx2, thin wrappers that forward to
 * put_vc1_mspel_mc / avg_vc1_mspel_mc with a as hmode and b as vmode. */ | |
470 #define DECLARE_FUNCTION(a, b) \ | |
471 static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ | |
9441 | 472 put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ |
473 }\ | |
474 static void avg_vc1_mspel_mc ## a ## b ## _mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ | |
475 avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ | |
8430 | 476 } |
477 | |
478 DECLARE_FUNCTION(0, 1) | |
479 DECLARE_FUNCTION(0, 2) | |
480 DECLARE_FUNCTION(0, 3) | |
481 | |
482 DECLARE_FUNCTION(1, 0) | |
483 DECLARE_FUNCTION(1, 1) | |
484 DECLARE_FUNCTION(1, 2) | |
485 DECLARE_FUNCTION(1, 3) | |
486 | |
487 DECLARE_FUNCTION(2, 0) | |
488 DECLARE_FUNCTION(2, 1) | |
489 DECLARE_FUNCTION(2, 2) | |
490 DECLARE_FUNCTION(2, 3) | |
491 | |
492 DECLARE_FUNCTION(3, 0) | |
493 DECLARE_FUNCTION(3, 1) | |
494 DECLARE_FUNCTION(3, 2) | |
495 DECLARE_FUNCTION(3, 3) | |
496 | |
9859
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
497 static void vc1_inv_trans_4x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
498 { |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
499 int dc = block[0]; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
500 dc = (17 * dc + 4) >> 3; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
501 dc = (17 * dc + 64) >> 7; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
502 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
503 "movd %0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
504 "pshufw $0, %%mm0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
505 "pxor %%mm1, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
506 "psubw %%mm0, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
507 "packuswb %%mm0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
508 "packuswb %%mm1, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
509 ::"r"(dc) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
510 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
511 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
512 "movd %0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
513 "movd %1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
514 "movd %2, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
515 "movd %3, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
516 "paddusb %%mm0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
517 "paddusb %%mm0, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
518 "paddusb %%mm0, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
519 "paddusb %%mm0, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
520 "psubusb %%mm1, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
521 "psubusb %%mm1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
522 "psubusb %%mm1, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
523 "psubusb %%mm1, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
524 "movd %%mm2, %0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
525 "movd %%mm3, %1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
526 "movd %%mm4, %2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
527 "movd %%mm5, %3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
528 :"+m"(*(uint32_t*)(dest+0*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
529 "+m"(*(uint32_t*)(dest+1*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
530 "+m"(*(uint32_t*)(dest+2*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
531 "+m"(*(uint32_t*)(dest+3*linesize)) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
532 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
533 } |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
534 |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
535 static void vc1_inv_trans_4x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
536 { |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
537 int dc = block[0]; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
538 dc = (17 * dc + 4) >> 3; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
539 dc = (12 * dc + 64) >> 7; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
540 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
541 "movd %0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
542 "pshufw $0, %%mm0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
543 "pxor %%mm1, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
544 "psubw %%mm0, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
545 "packuswb %%mm0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
546 "packuswb %%mm1, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
547 ::"r"(dc) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
548 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
549 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
550 "movd %0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
551 "movd %1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
552 "movd %2, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
553 "movd %3, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
554 "paddusb %%mm0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
555 "paddusb %%mm0, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
556 "paddusb %%mm0, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
557 "paddusb %%mm0, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
558 "psubusb %%mm1, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
559 "psubusb %%mm1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
560 "psubusb %%mm1, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
561 "psubusb %%mm1, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
562 "movd %%mm2, %0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
563 "movd %%mm3, %1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
564 "movd %%mm4, %2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
565 "movd %%mm5, %3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
566 :"+m"(*(uint32_t*)(dest+0*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
567 "+m"(*(uint32_t*)(dest+1*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
568 "+m"(*(uint32_t*)(dest+2*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
569 "+m"(*(uint32_t*)(dest+3*linesize)) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
570 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
571 dest += 4*linesize; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
572 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
573 "movd %0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
574 "movd %1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
575 "movd %2, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
576 "movd %3, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
577 "paddusb %%mm0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
578 "paddusb %%mm0, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
579 "paddusb %%mm0, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
580 "paddusb %%mm0, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
581 "psubusb %%mm1, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
582 "psubusb %%mm1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
583 "psubusb %%mm1, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
584 "psubusb %%mm1, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
585 "movd %%mm2, %0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
586 "movd %%mm3, %1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
587 "movd %%mm4, %2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
588 "movd %%mm5, %3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
589 :"+m"(*(uint32_t*)(dest+0*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
590 "+m"(*(uint32_t*)(dest+1*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
591 "+m"(*(uint32_t*)(dest+2*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
592 "+m"(*(uint32_t*)(dest+3*linesize)) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
593 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
594 } |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
595 |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
596 static void vc1_inv_trans_8x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
597 { |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
598 int dc = block[0]; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
599 dc = ( 3 * dc + 1) >> 1; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
600 dc = (17 * dc + 64) >> 7; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
601 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
602 "movd %0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
603 "pshufw $0, %%mm0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
604 "pxor %%mm1, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
605 "psubw %%mm0, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
606 "packuswb %%mm0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
607 "packuswb %%mm1, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
608 ::"r"(dc) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
609 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
610 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
611 "movq %0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
612 "movq %1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
613 "movq %2, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
614 "movq %3, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
615 "paddusb %%mm0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
616 "paddusb %%mm0, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
617 "paddusb %%mm0, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
618 "paddusb %%mm0, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
619 "psubusb %%mm1, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
620 "psubusb %%mm1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
621 "psubusb %%mm1, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
622 "psubusb %%mm1, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
623 "movq %%mm2, %0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
624 "movq %%mm3, %1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
625 "movq %%mm4, %2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
626 "movq %%mm5, %3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
627 :"+m"(*(uint32_t*)(dest+0*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
628 "+m"(*(uint32_t*)(dest+1*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
629 "+m"(*(uint32_t*)(dest+2*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
630 "+m"(*(uint32_t*)(dest+3*linesize)) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
631 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
632 } |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
633 |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
634 static void vc1_inv_trans_8x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
635 { |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
636 int dc = block[0]; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
637 dc = (3 * dc + 1) >> 1; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
638 dc = (3 * dc + 16) >> 5; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
639 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
640 "movd %0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
641 "pshufw $0, %%mm0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
642 "pxor %%mm1, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
643 "psubw %%mm0, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
644 "packuswb %%mm0, %%mm0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
645 "packuswb %%mm1, %%mm1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
646 ::"r"(dc) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
647 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
648 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
649 "movq %0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
650 "movq %1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
651 "movq %2, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
652 "movq %3, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
653 "paddusb %%mm0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
654 "paddusb %%mm0, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
655 "paddusb %%mm0, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
656 "paddusb %%mm0, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
657 "psubusb %%mm1, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
658 "psubusb %%mm1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
659 "psubusb %%mm1, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
660 "psubusb %%mm1, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
661 "movq %%mm2, %0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
662 "movq %%mm3, %1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
663 "movq %%mm4, %2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
664 "movq %%mm5, %3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
665 :"+m"(*(uint32_t*)(dest+0*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
666 "+m"(*(uint32_t*)(dest+1*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
667 "+m"(*(uint32_t*)(dest+2*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
668 "+m"(*(uint32_t*)(dest+3*linesize)) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
669 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
670 dest += 4*linesize; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
671 __asm__ volatile( |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
672 "movq %0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
673 "movq %1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
674 "movq %2, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
675 "movq %3, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
676 "paddusb %%mm0, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
677 "paddusb %%mm0, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
678 "paddusb %%mm0, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
679 "paddusb %%mm0, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
680 "psubusb %%mm1, %%mm2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
681 "psubusb %%mm1, %%mm3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
682 "psubusb %%mm1, %%mm4 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
683 "psubusb %%mm1, %%mm5 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
684 "movq %%mm2, %0 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
685 "movq %%mm3, %1 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
686 "movq %%mm4, %2 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
687 "movq %%mm5, %3 \n\t" |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
688 :"+m"(*(uint32_t*)(dest+0*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
689 "+m"(*(uint32_t*)(dest+1*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
690 "+m"(*(uint32_t*)(dest+2*linesize)), |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
691 "+m"(*(uint32_t*)(dest+3*linesize)) |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
692 ); |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
693 } |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
694 |
8430 | 695 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { |
9441 | 696 mm_flags = mm_support(); |
697 | |
8430 | 698 dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx; |
699 dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx; | |
700 dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx; | |
701 dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx; | |
702 | |
703 dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx; | |
704 dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx; | |
705 dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx; | |
706 dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx; | |
707 | |
708 dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx; | |
709 dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx; | |
710 dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx; | |
711 dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx; | |
712 | |
713 dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx; | |
714 dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx; | |
715 dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; | |
716 dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; | |
9441 | 717 |
718 if (mm_flags & FF_MM_MMX2){ | |
719 dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmx2; | |
720 dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmx2; | |
721 dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmx2; | |
722 dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmx2; | |
723 | |
724 dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmx2; | |
725 dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmx2; | |
726 dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmx2; | |
727 dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmx2; | |
728 | |
729 dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmx2; | |
730 dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmx2; | |
731 dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmx2; | |
732 dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmx2; | |
733 | |
734 dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmx2; | |
735 dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2; | |
736 dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2; | |
737 dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2; | |
9859
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
738 |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
739 dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
740 dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
741 dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2; |
7a116de63777
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents:
9441
diff
changeset
|
742 dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2; |
9441 | 743 } |
8430 | 744 } |