annotate x86/vc1dsp_mmx.c @ 10952:ea8f891d997d libavcodec

H264 DXVA2 implementation It allows VLD H264 decoding using DXVA2 (GPU assisted decoding API under VISTA and Windows 7). It is implemented by using AVHWAccel API. It has been tested successfully for some time in VLC using an nvidia card on Windows 7. To compile it, you need to have the system header dxva2api.h (either from microsoft or using http://downloads.videolan.org/pub/videolan/testing/contrib/dxva2api.h) The generated libavcodec.dll does not depend directly on any new lib as the necessary objects are given by the application using FFmpeg.
author fenrir
date Wed, 20 Jan 2010 18:54:51 +0000
parents 7a116de63777
children 34a65026fa06
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
1 /*
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
2 * VC-1 and WMV3 - DSP functions MMX-optimized
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
3 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
4 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
5 * Permission is hereby granted, free of charge, to any person
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
6 * obtaining a copy of this software and associated documentation
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
7 * files (the "Software"), to deal in the Software without
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
8 * restriction, including without limitation the rights to use,
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
9 * copy, modify, merge, publish, distribute, sublicense, and/or sell
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
10 * copies of the Software, and to permit persons to whom the
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
11 * Software is furnished to do so, subject to the following
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
12 * conditions:
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
13 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
14 * The above copyright notice and this permission notice shall be
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
15 * included in all copies or substantial portions of the Software.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
16 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
19 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
21 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
22 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
24 * OTHER DEALINGS IN THE SOFTWARE.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
25 */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
26
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
27 #include "libavutil/x86_cpu.h"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
28 #include "libavcodec/dsputil.h"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
29 #include "dsputil_mmx.h"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
30
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
31 #define OP_PUT(S,D)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
32 #define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
33
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
34 /** Add the rounder from mm7 to mm3 and mm4, then arithmetic-shift both right by SHIFT */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
35 #define NORMALIZE_MMX(SHIFT) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
36 "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
37 "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
38 "psraw "SHIFT", %%mm3 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
39 "psraw "SHIFT", %%mm4 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
40
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
41 #define TRANSFER_DO_PACK(OP) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
42 "packuswb %%mm4, %%mm3 \n\t" \
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
43 OP((%2), %%mm3) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
44 "movq %%mm3, (%2) \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
45
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
46 #define TRANSFER_DONT_PACK(OP) \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
47 OP(0(%2), %%mm3) \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
48 OP(8(%2), %%mm4) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
49 "movq %%mm3, 0(%2) \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
50 "movq %%mm4, 8(%2) \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
51
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
52 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
53 #define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
54 #define DONT_UNPACK(reg)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
55
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
56 /** Load the rounder (32-r or 8-r) and replicate it into all four words of mm7 */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
57 #define LOAD_ROUNDER_MMX(ROUND) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
58 "movd "ROUND", %%mm7 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
59 "punpcklwd %%mm7, %%mm7 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
60 "punpckldq %%mm7, %%mm7 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
61
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
62 #define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
63 "paddw %%mm"#R2", %%mm"#R1" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
64 "movd (%0,%3), %%mm"#R0" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
65 "pmullw %%mm6, %%mm"#R1" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
66 "punpcklbw %%mm0, %%mm"#R0" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
67 "movd (%0,%2), %%mm"#R3" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
68 "psubw %%mm"#R0", %%mm"#R1" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
69 "punpcklbw %%mm0, %%mm"#R3" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
70 "paddw %%mm7, %%mm"#R1" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
71 "psubw %%mm"#R3", %%mm"#R1" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
72 "psraw %4, %%mm"#R1" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
73 "movq %%mm"#R1", "#OFF"(%1) \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
74 "add %2, %0 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
75
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
76 DECLARE_ALIGNED_16(const uint64_t, ff_pw_9) = 0x0009000900090009ULL;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
77
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
78 /** Sacrificing mm6 allows pipelining the loads from src */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
79 static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
80 const uint8_t *src, x86_reg stride,
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
81 int rnd, int64_t shift)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
82 {
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
83 __asm__ volatile(
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
84 "mov $3, %%"REG_c" \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
85 LOAD_ROUNDER_MMX("%5")
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
86 "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
87 "1: \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
88 "movd (%0), %%mm2 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
89 "add %2, %0 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
90 "movd (%0), %%mm3 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
91 "punpcklbw %%mm0, %%mm2 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
92 "punpcklbw %%mm0, %%mm3 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
93 SHIFT2_LINE( 0, 1, 2, 3, 4)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
94 SHIFT2_LINE( 24, 2, 3, 4, 1)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
95 SHIFT2_LINE( 48, 3, 4, 1, 2)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
96 SHIFT2_LINE( 72, 4, 1, 2, 3)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
97 SHIFT2_LINE( 96, 1, 2, 3, 4)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
98 SHIFT2_LINE(120, 2, 3, 4, 1)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
99 SHIFT2_LINE(144, 3, 4, 1, 2)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
100 SHIFT2_LINE(168, 4, 1, 2, 3)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
101 "sub %6, %0 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
102 "add $8, %1 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
103 "dec %%"REG_c" \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
104 "jnz 1b \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
105 : "+r"(src), "+r"(dst)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
106 : "r"(stride), "r"(-2*stride),
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
107 "m"(shift), "m"(rnd), "r"(9*stride-4)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
108 : "%"REG_c, "memory"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
109 );
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
110 }
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
111
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
112 /**
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
113 * Data is already unpacked, so some operations can directly be made from
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
114 * memory.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
115 */
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
116 #define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
117 static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
118 const int16_t *src, int rnd)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
119 {\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
120 int h = 8;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
121 \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
122 src -= 1;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
123 rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
124 __asm__ volatile(\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
125 LOAD_ROUNDER_MMX("%4")\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
126 "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
127 "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
128 "1: \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
129 "movq 2*0+0(%1), %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
130 "movq 2*0+8(%1), %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
131 "movq 2*1+0(%1), %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
132 "movq 2*1+8(%1), %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
133 "paddw 2*3+0(%1), %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
134 "paddw 2*3+8(%1), %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
135 "paddw 2*2+0(%1), %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
136 "paddw 2*2+8(%1), %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
137 "pmullw %%mm5, %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
138 "pmullw %%mm5, %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
139 "psubw %%mm1, %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
140 "psubw %%mm2, %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
141 NORMALIZE_MMX("$7")\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
142 /* Remove bias */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
143 "paddw %%mm6, %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
144 "paddw %%mm6, %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
145 TRANSFER_DO_PACK(OP)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
146 "add $24, %1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
147 "add %3, %2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
148 "decl %0 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
149 "jnz 1b \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
150 : "+r"(h), "+r" (src), "+r" (dst)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
151 : "r"(stride), "m"(rnd)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
152 : "memory"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
153 );\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
154 }
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
155
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
156 VC1_HOR_16b_SHIFT2(OP_PUT, put_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
157 VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
158
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
159
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
160 /**
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
161 * Purely vertical or horizontal 1/2 shift interpolation.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
162 * Sacrifices mm6 to hold the *9 factor.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
163 */
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
164 #define VC1_SHIFT2(OP, OPNAME)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
165 static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
166 x86_reg stride, int rnd, x86_reg offset)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
167 {\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
168 rnd = 8-rnd;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
169 __asm__ volatile(\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
170 "mov $8, %%"REG_c" \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
171 LOAD_ROUNDER_MMX("%5")\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
172 "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
173 "1: \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
174 "movd 0(%0 ), %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
175 "movd 4(%0 ), %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
176 "movd 0(%0,%2), %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
177 "movd 4(%0,%2), %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
178 "add %2, %0 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
179 "punpcklbw %%mm0, %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
180 "punpcklbw %%mm0, %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
181 "punpcklbw %%mm0, %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
182 "punpcklbw %%mm0, %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
183 "paddw %%mm1, %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
184 "paddw %%mm2, %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
185 "movd 0(%0,%3), %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
186 "movd 4(%0,%3), %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
187 "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
188 "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
189 "punpcklbw %%mm0, %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
190 "punpcklbw %%mm0, %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
191 "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
192 "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
193 "movd 0(%0,%2), %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
194 "movd 4(%0,%2), %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
195 "punpcklbw %%mm0, %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
196 "punpcklbw %%mm0, %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
197 "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
198 "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
199 NORMALIZE_MMX("$4")\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
200 "packuswb %%mm4, %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
201 OP((%1), %%mm3)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
202 "movq %%mm3, (%1) \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
203 "add %6, %0 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
204 "add %4, %1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
205 "dec %%"REG_c" \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
206 "jnz 1b \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
207 : "+r"(src), "+r"(dst)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
208 : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
209 "g"(stride-offset)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
210 : "%"REG_c, "memory"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
211 );\
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
212 }
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
213
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
214 VC1_SHIFT2(OP_PUT, put_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
215 VC1_SHIFT2(OP_AVG, avg_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
216
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
217 /**
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
218 * Filter coefficients made global to allow access by all 1 or 3 quarter shift
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
219 * interpolation functions.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
220 */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
221 DECLARE_ASM_CONST(16, uint64_t, ff_pw_53) = 0x0035003500350035ULL;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
222 DECLARE_ASM_CONST(16, uint64_t, ff_pw_18) = 0x0012001200120012ULL;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
223
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
224 /**
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
225 * Core of the 1/4 and 3/4 shift bicubic interpolation.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
226 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
227 * @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty).
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
228 * @param MOVQ "movd 1" for packed 8-bit data, or "movq 2" if the data read is already unpacked.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
229 * @param A1 Address of 1st tap (beware of unpacked/packed).
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
230 * @param A2 Address of 2nd tap
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
231 * @param A3 Address of 3rd tap
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
232 * @param A4 Address of 4th tap
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
233 */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
234 #define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
235 MOVQ "*0+"A1", %%mm1 \n\t" /* A1, elements 0-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
236 MOVQ "*4+"A1", %%mm2 \n\t" /* A1, elements 4-7 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
237 UNPACK("%%mm1") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
238 UNPACK("%%mm2") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
239 "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" /* *3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
240 "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" /* *3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
241 MOVQ "*0+"A2", %%mm3 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
242 MOVQ "*4+"A2", %%mm4 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
243 UNPACK("%%mm3") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
244 UNPACK("%%mm4") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
245 "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
246 "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
247 "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
248 "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
249 MOVQ "*0+"A4", %%mm1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
250 MOVQ "*4+"A4", %%mm2 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
251 UNPACK("%%mm1") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
252 UNPACK("%%mm2") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
253 "psllw $2, %%mm1 \n\t" /* 4* */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
254 "psllw $2, %%mm2 \n\t" /* 4* */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
255 "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
256 "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
257 MOVQ "*0+"A3", %%mm1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
258 MOVQ "*4+"A3", %%mm2 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
259 UNPACK("%%mm1") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
260 UNPACK("%%mm2") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
261 "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
262 "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
263 "paddw %%mm1, %%mm3 \n\t" /* -4,53,18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
264 "paddw %%mm2, %%mm4 \n\t" /* -4,53,18,-3 */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
265
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
266 /**
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
267 * Macro to build the vertical 16bits version of vc1_put_shift[13].
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
268 * Here, offset=src_stride. Parameters passed A1 to A4 must use
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
269 * %3 (src_stride) and %4 (3*src_stride).
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
270 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
271 * @param NAME Either shift1 or shift3
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
272 * @see MSPEL_FILTER13_CORE for information on A1->A4
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
273 */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
274 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
275 static void \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
276 vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
277 x86_reg src_stride, \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
278 int rnd, int64_t shift) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
279 { \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
280 int h = 8; /* row counter, decremented by the asm loop */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
281 src -= src_stride; /* filter taps start one row above src */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
282 __asm__ volatile( \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
283 LOAD_ROUNDER_MMX("%5") /* %5 = rnd */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
284 "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
285 "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
286 ASMALIGN(3) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
287 "1: \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
288 MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
289 NORMALIZE_MMX("%6") /* %6 = shift */ \
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
290 TRANSFER_DONT_PACK(OP_PUT) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
291 /* Last 3 (in fact 4) bytes on the line */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
292 "movd 8+"A1", %%mm1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
293 DO_UNPACK("%%mm1") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
294 "movq %%mm1, %%mm3 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
295 "paddw %%mm1, %%mm1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
296 "paddw %%mm3, %%mm1 \n\t" /* 3* */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
297 "movd 8+"A2", %%mm3 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
298 DO_UNPACK("%%mm3") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
299 "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
300 "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
301 "movd 8+"A3", %%mm1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
302 DO_UNPACK("%%mm1") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
303 "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
304 "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
305 "movd 8+"A4", %%mm1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
306 DO_UNPACK("%%mm1") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
307 "psllw $2, %%mm1 \n\t" /* 4* */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
308 "psubw %%mm1, %%mm3 \n\t" /* -4,53,18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
309 "paddw %%mm7, %%mm3 \n\t" /* mm7: presumably the rounder set up by LOAD_ROUNDER_MMX -- confirm */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
310 "psraw %6, %%mm3 \n\t" /* >> shift */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
311 "movq %%mm3, 16(%2) \n\t" /* store dst elements 8-11 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
312 "add %3, %1 \n\t" /* src += src_stride */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
313 "add $24, %2 \n\t" /* dst += 12 int16_t (24 bytes per output row) */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
314 "decl %0 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
315 "jnz 1b \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
316 : "+r"(h), "+r" (src), "+r" (dst) /* %0, %1, %2 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
317 : "r"(src_stride), "r"(3*src_stride), /* %3, %4 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
318 "m"(rnd), "m"(shift) /* %5, %6 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
319 : "memory" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
320 ); \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
321 }
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
322
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
323 /**
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
324 * Macro to build the horizontal 16bits version of vc1_put_shift[13].
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
325 * Here, offset=16bits, so parameters passed A1 to A4 should be simple.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
326 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
327 * @param NAME Either shift1 or shift3
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
328 * @see MSPEL_FILTER13_CORE for information on A1->A4
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
329 */
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
330 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
331 static void \
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
332 OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
333 const int16_t *src, int rnd) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
334 { \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
335 int h = 8; /* row counter, decremented by the asm loop */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
336 src -= 1; /* filter taps start one int16_t element to the left */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
337 rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
338 __asm__ volatile( \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
339 LOAD_ROUNDER_MMX("%4") /* %4 = rnd */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
340 "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
341 "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
342 ASMALIGN(3) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
343 "1: \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
344 MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
345 NORMALIZE_MMX("$7") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
346 /* Remove bias */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
347 "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
348 "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
349 TRANSFER_DO_PACK(OP) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
350 "add $24, %1 \n\t" /* src += 12 int16_t (24 bytes per input row) */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
351 "add %3, %2 \n\t" /* dst += stride */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
352 "decl %0 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
353 "jnz 1b \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
354 : "+r"(h), "+r" (src), "+r" (dst) /* %0, %1, %2 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
355 : "r"(stride), "m"(rnd) /* %3, %4 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
356 : "memory" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
357 ); \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
358 }
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
359
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
360 /**
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
361 * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
362 * Here, offset is the tap spacing (1 or stride). Parameters passed A1 to A4 must use
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
363 * %3 (offset) and %4 (3*offset).
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
364 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
365 * @param NAME Either shift1 or shift3
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
366 * @see MSPEL_FILTER13_CORE for information on A1->A4
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
367 */
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
368 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
369 static void \
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
370 OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
371 x86_reg stride, int rnd, x86_reg offset) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
372 { \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
373 int h = 8; /* row counter, decremented by the asm loop */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
374 src -= offset; /* filter taps start one tap spacing before src */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
375 rnd = 32-rnd; /* passed to the asm as %6 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
376 __asm__ volatile ( \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
377 LOAD_ROUNDER_MMX("%6") /* %6 = rnd */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
378 "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
379 "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
380 ASMALIGN(3) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
381 "1: \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
382 MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
383 NORMALIZE_MMX("$6") \
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
384 TRANSFER_DO_PACK(OP) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
385 "add %5, %1 \n\t" /* src += stride */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
386 "add %5, %2 \n\t" /* dst += stride */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
387 "decl %0 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
388 "jnz 1b \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
389 : "+r"(h), "+r" (src), "+r" (dst) /* %0, %1, %2 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
390 : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) /* %3, %4, %5, %6 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
391 : "memory" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
392 ); \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
393 }
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
394
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
395 /** 1/4 shift bicubic interpolation */
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
396 MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
397 MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
398 MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
399 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
400 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
401
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
402 /** 3/4 shift bicubic interpolation */
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
403 MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
404 MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
405 MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
406 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
407 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
408
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
/* Vertical pass: 8-bit src to 16-bit dst; matches the vc1_put_ver_16b_NAME_mmx signature. */
409 typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
/* Horizontal pass: 16-bit src to 8-bit dst; matches the vc1_hor_16b_NAME_mmx signature. */
410 typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
/* Single-direction 8-bit filter; offset is the distance between filter taps. */
411 typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
412
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
413 /**
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
414 * Interpolates fractional pel values by applying proper vertical then
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
415 * horizontal filter.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
416 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
417 * @param dst Destination buffer for interpolated pels.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
418 * @param src Source buffer.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
419 * @param stride Stride for both src and dst buffers.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
420 * @param hmode Horizontal filter (expressed in quarter pixels shift).
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
421 * @param vmode Vertical filter.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
422 * @param rnd Rounding bias.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
423 */
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
424 #define VC1_MSPEL_MC(OP)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
425 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
426 int hmode, int vmode, int rnd)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
427 {\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
428 static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
429 { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx }; /* indexed by vmode; slot 0 unused */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
430 static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
431 { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx }; /* indexed by hmode; slot 0 unused */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
432 static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
433 { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx }; /* indexed by vmode or hmode; slot 0 unused */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
434 \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
435 __asm__ volatile(\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
436 "pxor %%mm0, %%mm0 \n\t" /* mm0 = 0; presumably relied on by the filter macros -- confirm */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
437 ::: "memory"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
438 );\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
439 \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
440 if (vmode) { /* Vertical filter to apply */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
441 if (hmode) { /* Horizontal filter to apply, output to tmp */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
442 static const int shift_value[] = { 0, 5, 1, 5 };\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
443 int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
444 int r;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
445 DECLARE_ALIGNED_16(int16_t, tmp[12*8]); /* 8 rows of 12 int16_t intermediates */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
446 \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
447 r = (1<<(shift-1)) + rnd-1;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
448 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); /* vertical pass into tmp, starting one pixel to the left */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
449 \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
450 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); /* horizontal pass from tmp to dst */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
451 return;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
452 }\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
453 else { /* No horizontal filter, output 8 lines to dst */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
454 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); /* tap spacing = stride */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
455 return;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
456 }\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
457 }\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
458 \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
459 /* Horizontal mode with no vertical mode */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
460 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); /* tap spacing = 1 pixel */\
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
461 }
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
462
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
463 VC1_MSPEL_MC(put_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
464 VC1_MSPEL_MC(avg_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
465
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
/* Prototypes only: no definition of these two functions appears in this part of the file. */
/* NOTE(review): "mc00" presumably means hmode=0, vmode=0, following the DECLARE_FUNCTION naming -- confirm. */
466 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
467 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd);
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
468
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
469 /** Macro to ease bicubic filter interpolation functions declarations */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
470 #define DECLARE_FUNCTION(a, b) /* a is passed as hmode, b as vmode */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
471 static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
472 put_vc1_mspel_mc(dst, src, stride, a, b, rnd); /* put variant */ \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
473 }\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
474 static void avg_vc1_mspel_mc ## a ## b ## _mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
475 avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); /* avg variant */ \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
476 }
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
477
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
478 DECLARE_FUNCTION(0, 1)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
479 DECLARE_FUNCTION(0, 2)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
480 DECLARE_FUNCTION(0, 3)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
481
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
482 DECLARE_FUNCTION(1, 0)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
483 DECLARE_FUNCTION(1, 1)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
484 DECLARE_FUNCTION(1, 2)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
485 DECLARE_FUNCTION(1, 3)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
486
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
487 DECLARE_FUNCTION(2, 0)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
488 DECLARE_FUNCTION(2, 1)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
489 DECLARE_FUNCTION(2, 2)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
490 DECLARE_FUNCTION(2, 3)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
491
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
492 DECLARE_FUNCTION(3, 0)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
493 DECLARE_FUNCTION(3, 1)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
494 DECLARE_FUNCTION(3, 2)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
495 DECLARE_FUNCTION(3, 3)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
496
9859
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
497 static void vc1_inv_trans_4x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
498 {
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
499 int dc = block[0];
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
500 dc = (17 * dc + 4) >> 3;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
501 dc = (17 * dc + 64) >> 7;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
502 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
503 "movd %0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
504 "pshufw $0, %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
505 "pxor %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
506 "psubw %%mm0, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
507 "packuswb %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
508 "packuswb %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
509 ::"r"(dc)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
510 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
511 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
512 "movd %0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
513 "movd %1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
514 "movd %2, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
515 "movd %3, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
516 "paddusb %%mm0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
517 "paddusb %%mm0, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
518 "paddusb %%mm0, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
519 "paddusb %%mm0, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
520 "psubusb %%mm1, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
521 "psubusb %%mm1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
522 "psubusb %%mm1, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
523 "psubusb %%mm1, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
524 "movd %%mm2, %0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
525 "movd %%mm3, %1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
526 "movd %%mm4, %2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
527 "movd %%mm5, %3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
528 :"+m"(*(uint32_t*)(dest+0*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
529 "+m"(*(uint32_t*)(dest+1*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
530 "+m"(*(uint32_t*)(dest+2*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
531 "+m"(*(uint32_t*)(dest+3*linesize))
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
532 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
533 }
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
534
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
535 static void vc1_inv_trans_4x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
536 {
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
537 int dc = block[0];
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
538 dc = (17 * dc + 4) >> 3;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
539 dc = (12 * dc + 64) >> 7;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
540 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
541 "movd %0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
542 "pshufw $0, %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
543 "pxor %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
544 "psubw %%mm0, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
545 "packuswb %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
546 "packuswb %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
547 ::"r"(dc)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
548 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
549 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
550 "movd %0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
551 "movd %1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
552 "movd %2, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
553 "movd %3, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
554 "paddusb %%mm0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
555 "paddusb %%mm0, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
556 "paddusb %%mm0, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
557 "paddusb %%mm0, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
558 "psubusb %%mm1, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
559 "psubusb %%mm1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
560 "psubusb %%mm1, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
561 "psubusb %%mm1, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
562 "movd %%mm2, %0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
563 "movd %%mm3, %1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
564 "movd %%mm4, %2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
565 "movd %%mm5, %3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
566 :"+m"(*(uint32_t*)(dest+0*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
567 "+m"(*(uint32_t*)(dest+1*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
568 "+m"(*(uint32_t*)(dest+2*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
569 "+m"(*(uint32_t*)(dest+3*linesize))
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
570 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
571 dest += 4*linesize;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
572 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
573 "movd %0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
574 "movd %1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
575 "movd %2, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
576 "movd %3, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
577 "paddusb %%mm0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
578 "paddusb %%mm0, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
579 "paddusb %%mm0, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
580 "paddusb %%mm0, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
581 "psubusb %%mm1, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
582 "psubusb %%mm1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
583 "psubusb %%mm1, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
584 "psubusb %%mm1, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
585 "movd %%mm2, %0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
586 "movd %%mm3, %1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
587 "movd %%mm4, %2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
588 "movd %%mm5, %3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
589 :"+m"(*(uint32_t*)(dest+0*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
590 "+m"(*(uint32_t*)(dest+1*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
591 "+m"(*(uint32_t*)(dest+2*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
592 "+m"(*(uint32_t*)(dest+3*linesize))
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
593 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
594 }
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
595
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
596 static void vc1_inv_trans_8x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
597 {
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
598 int dc = block[0];
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
599 dc = ( 3 * dc + 1) >> 1;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
600 dc = (17 * dc + 64) >> 7;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
601 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
602 "movd %0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
603 "pshufw $0, %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
604 "pxor %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
605 "psubw %%mm0, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
606 "packuswb %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
607 "packuswb %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
608 ::"r"(dc)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
609 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
610 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
611 "movq %0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
612 "movq %1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
613 "movq %2, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
614 "movq %3, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
615 "paddusb %%mm0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
616 "paddusb %%mm0, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
617 "paddusb %%mm0, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
618 "paddusb %%mm0, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
619 "psubusb %%mm1, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
620 "psubusb %%mm1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
621 "psubusb %%mm1, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
622 "psubusb %%mm1, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
623 "movq %%mm2, %0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
624 "movq %%mm3, %1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
625 "movq %%mm4, %2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
626 "movq %%mm5, %3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
627 :"+m"(*(uint32_t*)(dest+0*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
628 "+m"(*(uint32_t*)(dest+1*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
629 "+m"(*(uint32_t*)(dest+2*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
630 "+m"(*(uint32_t*)(dest+3*linesize))
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
631 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
632 }
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
633
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
634 static void vc1_inv_trans_8x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
635 {
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
636 int dc = block[0];
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
637 dc = (3 * dc + 1) >> 1;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
638 dc = (3 * dc + 16) >> 5;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
639 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
640 "movd %0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
641 "pshufw $0, %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
642 "pxor %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
643 "psubw %%mm0, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
644 "packuswb %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
645 "packuswb %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
646 ::"r"(dc)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
647 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
648 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
649 "movq %0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
650 "movq %1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
651 "movq %2, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
652 "movq %3, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
653 "paddusb %%mm0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
654 "paddusb %%mm0, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
655 "paddusb %%mm0, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
656 "paddusb %%mm0, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
657 "psubusb %%mm1, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
658 "psubusb %%mm1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
659 "psubusb %%mm1, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
660 "psubusb %%mm1, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
661 "movq %%mm2, %0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
662 "movq %%mm3, %1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
663 "movq %%mm4, %2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
664 "movq %%mm5, %3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
665 :"+m"(*(uint32_t*)(dest+0*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
666 "+m"(*(uint32_t*)(dest+1*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
667 "+m"(*(uint32_t*)(dest+2*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
668 "+m"(*(uint32_t*)(dest+3*linesize))
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
669 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
670 dest += 4*linesize;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
671 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
672 "movq %0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
673 "movq %1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
674 "movq %2, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
675 "movq %3, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
676 "paddusb %%mm0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
677 "paddusb %%mm0, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
678 "paddusb %%mm0, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
679 "paddusb %%mm0, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
680 "psubusb %%mm1, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
681 "psubusb %%mm1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
682 "psubusb %%mm1, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
683 "psubusb %%mm1, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
684 "movq %%mm2, %0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
685 "movq %%mm3, %1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
686 "movq %%mm4, %2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
687 "movq %%mm5, %3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
688 :"+m"(*(uint32_t*)(dest+0*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
689 "+m"(*(uint32_t*)(dest+1*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
690 "+m"(*(uint32_t*)(dest+2*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
691 "+m"(*(uint32_t*)(dest+3*linesize))
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
692 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
693 }
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
694
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
695 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
696 mm_flags = mm_support();
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
697
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
698 dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
699 dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
700 dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
701 dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
702
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
703 dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
704 dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
705 dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
706 dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
707
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
708 dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
709 dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
710 dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
711 dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
712
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
713 dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
714 dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
715 dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
716 dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
717
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
718 if (mm_flags & FF_MM_MMX2){
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
719 dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
720 dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
721 dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
722 dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
723
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
724 dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
725 dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
726 dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
727 dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
728
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
729 dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
730 dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
731 dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
732 dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
733
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
734 dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
735 dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
736 dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
737 dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2;
9859
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
738
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
739 dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
740 dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
741 dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
742 dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2;
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
743 }
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
744 }