annotate x86/vc1dsp_mmx.c @ 12454:f4355cd85faa libavcodec

Port latest x264 deblock asm (before they moved to using NV12 as internal format), LGPL'ed with permission from Jason and Loren. This includes mmx2 code, so remove inline asm from h264dsp_mmx.c accordingly.
author rbultje
date Fri, 03 Sep 2010 16:52:46 +0000
parents 3fc4c625b6f3
children a5ddb39627fd
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
1 /*
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
2 * VC-1 and WMV3 - DSP functions MMX-optimized
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
3 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
4 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
5 * Permission is hereby granted, free of charge, to any person
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
6 * obtaining a copy of this software and associated documentation
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
7 * files (the "Software"), to deal in the Software without
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
8 * restriction, including without limitation the rights to use,
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
9 * copy, modify, merge, publish, distribute, sublicense, and/or sell
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
10 * copies of the Software, and to permit persons to whom the
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
11 * Software is furnished to do so, subject to the following
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
12 * conditions:
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
13 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
14 * The above copyright notice and this permission notice shall be
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
15 * included in all copies or substantial portions of the Software.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
16 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
19 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
21 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
22 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
24 * OTHER DEALINGS IN THE SOFTWARE.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
25 */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
26
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
27 #include "libavutil/x86_cpu.h"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
28 #include "libavcodec/dsputil.h"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
29 #include "dsputil_mmx.h"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
30
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
31 #define OP_PUT(S,D)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
32 #define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
33
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
34 /** Add rounder from mm7 to mm3 and mm4, then arithmetic-shift both right by SHIFT */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
35 #define NORMALIZE_MMX(SHIFT) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
36 "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
37 "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
38 "psraw "SHIFT", %%mm3 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
39 "psraw "SHIFT", %%mm4 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
40
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
41 #define TRANSFER_DO_PACK(OP) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
42 "packuswb %%mm4, %%mm3 \n\t" \
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
43 OP((%2), %%mm3) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
44 "movq %%mm3, (%2) \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
45
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
46 #define TRANSFER_DONT_PACK(OP) \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
47 OP(0(%2), %%mm3) \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
48 OP(8(%2), %%mm4) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
49 "movq %%mm3, 0(%2) \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
50 "movq %%mm4, 8(%2) \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
51
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
52 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
53 #define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
54 #define DONT_UNPACK(reg)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
55
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
56 /** Load the rounder (32-r or 8-r) and broadcast its low word to all four words of mm7 */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
57 #define LOAD_ROUNDER_MMX(ROUND) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
58 "movd "ROUND", %%mm7 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
59 "punpcklwd %%mm7, %%mm7 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
60 "punpckldq %%mm7, %%mm7 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
61
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
62 #define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
63 "paddw %%mm"#R2", %%mm"#R1" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
64 "movd (%0,%3), %%mm"#R0" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
65 "pmullw %%mm6, %%mm"#R1" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
66 "punpcklbw %%mm0, %%mm"#R0" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
67 "movd (%0,%2), %%mm"#R3" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
68 "psubw %%mm"#R0", %%mm"#R1" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
69 "punpcklbw %%mm0, %%mm"#R3" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
70 "paddw %%mm7, %%mm"#R1" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
71 "psubw %%mm"#R3", %%mm"#R1" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
72 "psraw %4, %%mm"#R1" \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
73 "movq %%mm"#R1", "#OFF"(%1) \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
74 "add %2, %0 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
75
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
76 /** Sacrificing mm6 allows pipelining of the loads from src */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
77 static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
78 const uint8_t *src, x86_reg stride,
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
79 int rnd, int64_t shift)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
80 {
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
81 __asm__ volatile(
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
82 "mov $3, %%"REG_c" \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
83 LOAD_ROUNDER_MMX("%5")
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
84 "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
85 "1: \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
86 "movd (%0), %%mm2 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
87 "add %2, %0 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
88 "movd (%0), %%mm3 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
89 "punpcklbw %%mm0, %%mm2 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
90 "punpcklbw %%mm0, %%mm3 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
91 SHIFT2_LINE( 0, 1, 2, 3, 4)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
92 SHIFT2_LINE( 24, 2, 3, 4, 1)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
93 SHIFT2_LINE( 48, 3, 4, 1, 2)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
94 SHIFT2_LINE( 72, 4, 1, 2, 3)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
95 SHIFT2_LINE( 96, 1, 2, 3, 4)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
96 SHIFT2_LINE(120, 2, 3, 4, 1)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
97 SHIFT2_LINE(144, 3, 4, 1, 2)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
98 SHIFT2_LINE(168, 4, 1, 2, 3)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
99 "sub %6, %0 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
100 "add $8, %1 \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
101 "dec %%"REG_c" \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
102 "jnz 1b \n\t"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
103 : "+r"(src), "+r"(dst)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
104 : "r"(stride), "r"(-2*stride),
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
105 "m"(shift), "m"(rnd), "r"(9*stride-4)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
106 : "%"REG_c, "memory"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
107 );
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
108 }
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
109
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
110 /**
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
111 * Data is already unpacked, so some operations can directly be made from
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
112 * memory.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
113 */
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
114 #define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
115 static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
116 const int16_t *src, int rnd)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
117 {\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
118 int h = 8;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
119 \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
120 src -= 1;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
121 rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
122 __asm__ volatile(\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
123 LOAD_ROUNDER_MMX("%4")\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
124 "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
125 "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
126 "1: \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
127 "movq 2*0+0(%1), %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
128 "movq 2*0+8(%1), %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
129 "movq 2*1+0(%1), %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
130 "movq 2*1+8(%1), %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
131 "paddw 2*3+0(%1), %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
132 "paddw 2*3+8(%1), %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
133 "paddw 2*2+0(%1), %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
134 "paddw 2*2+8(%1), %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
135 "pmullw %%mm5, %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
136 "pmullw %%mm5, %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
137 "psubw %%mm1, %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
138 "psubw %%mm2, %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
139 NORMALIZE_MMX("$7")\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
140 /* Remove bias */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
141 "paddw %%mm6, %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
142 "paddw %%mm6, %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
143 TRANSFER_DO_PACK(OP)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
144 "add $24, %1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
145 "add %3, %2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
146 "decl %0 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
147 "jnz 1b \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
148 : "+r"(h), "+r" (src), "+r" (dst)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
149 : "r"(stride), "m"(rnd)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
150 : "memory"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
151 );\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
152 }
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
153
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
154 VC1_HOR_16b_SHIFT2(OP_PUT, put_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
155 VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
156
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
157
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
158 /**
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
159 * Purely vertical or horizontal 1/2 shift interpolation.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
160 * Sacrifice mm6 to hold the *9 factor.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
161 */
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
162 #define VC1_SHIFT2(OP, OPNAME)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
163 static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
164 x86_reg stride, int rnd, x86_reg offset)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
165 {\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
166 rnd = 8-rnd;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
167 __asm__ volatile(\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
168 "mov $8, %%"REG_c" \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
169 LOAD_ROUNDER_MMX("%5")\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
170 "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
171 "1: \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
172 "movd 0(%0 ), %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
173 "movd 4(%0 ), %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
174 "movd 0(%0,%2), %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
175 "movd 4(%0,%2), %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
176 "add %2, %0 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
177 "punpcklbw %%mm0, %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
178 "punpcklbw %%mm0, %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
179 "punpcklbw %%mm0, %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
180 "punpcklbw %%mm0, %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
181 "paddw %%mm1, %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
182 "paddw %%mm2, %%mm4 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
183 "movd 0(%0,%3), %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
184 "movd 4(%0,%3), %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
185 "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
186 "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
187 "punpcklbw %%mm0, %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
188 "punpcklbw %%mm0, %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
189 "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
190 "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
191 "movd 0(%0,%2), %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
192 "movd 4(%0,%2), %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
193 "punpcklbw %%mm0, %%mm1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
194 "punpcklbw %%mm0, %%mm2 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
195 "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
196 "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
197 NORMALIZE_MMX("$4")\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
198 "packuswb %%mm4, %%mm3 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
199 OP((%1), %%mm3)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
200 "movq %%mm3, (%1) \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
201 "add %6, %0 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
202 "add %4, %1 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
203 "dec %%"REG_c" \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
204 "jnz 1b \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
205 : "+r"(src), "+r"(dst)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
206 : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
207 "g"(stride-offset)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
208 : "%"REG_c, "memory"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
209 );\
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
210 }
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
211
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
212 VC1_SHIFT2(OP_PUT, put_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
213 VC1_SHIFT2(OP_AVG, avg_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
214
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
215 /**
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
216 * Core of the 1/4 and 3/4 shift bicubic interpolation.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
217 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
218 * @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty).
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
219 * @param MOVQ "movd 1" for packed data, or "movq 2" if the data read is already unpacked (the number scales the *0 and *4 offsets).
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
220 * @param A1 Address of 1st tap (beware of unpacked/packed).
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
221 * @param A2 Address of 2nd tap
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
222 * @param A3 Address of 3rd tap
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
223 * @param A4 Address of 4th tap
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
224 */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
225 #define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
226 MOVQ "*0+"A1", %%mm1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
227 MOVQ "*4+"A1", %%mm2 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
228 UNPACK("%%mm1") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
229 UNPACK("%%mm2") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
230 "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
231 "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
232 MOVQ "*0+"A2", %%mm3 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
233 MOVQ "*4+"A2", %%mm4 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
234 UNPACK("%%mm3") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
235 UNPACK("%%mm4") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
236 "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
237 "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
238 "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
239 "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
240 MOVQ "*0+"A4", %%mm1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
241 MOVQ "*4+"A4", %%mm2 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
242 UNPACK("%%mm1") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
243 UNPACK("%%mm2") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
244 "psllw $2, %%mm1 \n\t" /* 4* */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
245 "psllw $2, %%mm2 \n\t" /* 4* */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
246 "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
247 "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
248 MOVQ "*0+"A3", %%mm1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
249 MOVQ "*4+"A3", %%mm2 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
250 UNPACK("%%mm1") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
251 UNPACK("%%mm2") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
252 "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
253 "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
254 "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
255 "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
256
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
257 /**
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
258 * Macro to build the vertical 16bits version of vc1_put_shift[13].
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
259 * Here, offset=src_stride. Parameters passed A1 to A4 must use
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
260 * %3 (src_stride) and %4 (3*src_stride).
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
261 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
262 * @param NAME Either shift1 or shift3 (pasted into the generated function name)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
263 * @see MSPEL_FILTER13_CORE for information on A1->A4
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
264 */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
265 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
266 static void \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
267 vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
268 x86_reg src_stride, \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
269 int rnd, int64_t shift) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
270 { \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
271 int h = 8; \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
272 src -= src_stride; \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
273 __asm__ volatile( \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
274 LOAD_ROUNDER_MMX("%5") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
275 "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
276 "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
277 ASMALIGN(3) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
278 "1: \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
279 MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
280 NORMALIZE_MMX("%6") \
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
281 TRANSFER_DONT_PACK(OP_PUT) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
282 /* Last 3 (in fact 4) bytes on the line */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
283 "movd 8+"A1", %%mm1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
284 DO_UNPACK("%%mm1") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
285 "movq %%mm1, %%mm3 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
286 "paddw %%mm1, %%mm1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
287 "paddw %%mm3, %%mm1 \n\t" /* 3* */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
288 "movd 8+"A2", %%mm3 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
289 DO_UNPACK("%%mm3") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
290 "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
291 "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
292 "movd 8+"A3", %%mm1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
293 DO_UNPACK("%%mm1") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
294 "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
295 "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
296 "movd 8+"A4", %%mm1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
297 DO_UNPACK("%%mm1") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
298 "psllw $2, %%mm1 \n\t" /* 4* */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
299 "psubw %%mm1, %%mm3 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
300 "paddw %%mm7, %%mm3 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
301 "psraw %6, %%mm3 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
302 "movq %%mm3, 16(%2) \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
303 "add %3, %1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
304 "add $24, %2 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
305 "decl %0 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
306 "jnz 1b \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
307 : "+r"(h), "+r" (src), "+r" (dst) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
308 : "r"(src_stride), "r"(3*src_stride), \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
309 "m"(rnd), "m"(shift) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
310 : "memory" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
311 ); \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
312 }
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
313
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
314 /**
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
315 * Macro to build the horizontal 16bits version of vc1_put_shift[13].
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
316 * Here, offset=16bits, so parameters passed A1 to A4 are plain constant displacements from %1 (src).
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
317 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
318 * @param NAME Either shift1 or shift3 (pasted into the generated function name)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
319 * @see MSPEL_FILTER13_CORE for information on A1->A4
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
320 */
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
321 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
322 static void \
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
323 OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
324 const int16_t *src, int rnd) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
325 { \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
326 int h = 8; \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
327 src -= 1; \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
328 rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
329 __asm__ volatile( \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
330 LOAD_ROUNDER_MMX("%4") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
331 "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
332 "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
333 ASMALIGN(3) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
334 "1: \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
335 MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
336 NORMALIZE_MMX("$7") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
337 /* Remove bias */ \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
338 "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
339 "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
340 TRANSFER_DO_PACK(OP) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
341 "add $24, %1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
342 "add %3, %2 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
343 "decl %0 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
344 "jnz 1b \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
345 : "+r"(h), "+r" (src), "+r" (dst) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
346 : "r"(stride), "m"(rnd) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
347 : "memory" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
348 ); \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
349 }
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
350
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
351 /**
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
352 * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
353 * Here, offset=src_stride. Parameters passed A1 to A4 must use
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
354 * %3 (offset) and %4 (3*offset).
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
355 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
356 * @param NAME Either shift1 or shift3 (pasted into the generated function name)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
357 * @see MSPEL_FILTER13_CORE for information on A1->A4
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
358 */
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
359 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
360 static void \
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
361 OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
362 x86_reg stride, int rnd, x86_reg offset) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
363 { \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
364 int h = 8; \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
365 src -= offset; \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
366 rnd = 32-rnd; \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
367 __asm__ volatile ( \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
368 LOAD_ROUNDER_MMX("%6") \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
369 "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
370 "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
371 ASMALIGN(3) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
372 "1: \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
373 MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
374 NORMALIZE_MMX("$6") \
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
375 TRANSFER_DO_PACK(OP) \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
376 "add %5, %1 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
377 "add %5, %2 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
378 "decl %0 \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
379 "jnz 1b \n\t" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
380 : "+r"(h), "+r" (src), "+r" (dst) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
381 : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
382 : "memory" \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
383 ); \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
384 }
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
385
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
386 /** 1/4 shift bicubic interpolation */
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
387 MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
388 MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
389 MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
390 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
391 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
392
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
393 /** 3/4 shift bicubic interpolation */
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
394 MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
395 MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
396 MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
397 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
398 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
399
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
400 typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
401 typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
402 typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
403
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
404 /**
12024
fdafbcef52f5 Fix grammar errors in documentation
mru
parents: 11381
diff changeset
405 * Interpolate fractional pel values by applying proper vertical then
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
406 * horizontal filter.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
407 *
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
408 * @param dst Destination buffer for interpolated pels.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
409 * @param src Source buffer.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
410 * @param stride Stride for both src and dst buffers.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
411 * @param hmode Horizontal filter (expressed in quarter pixels shift).
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
412 * @param vmode Vertical filter (expressed in quarter pixels shift).
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
413 * @param rnd Rounding bias.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
414 */
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
415 #define VC1_MSPEL_MC(OP)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
416 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
417 int hmode, int vmode, int rnd)\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
418 {\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
419 static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
420 { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
421 static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
422 { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
423 static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
424 { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
425 \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
426 __asm__ volatile(\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
427 "pxor %%mm0, %%mm0 \n\t"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
428 ::: "memory"\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
429 );\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
430 \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
431 if (vmode) { /* Vertical filter to apply */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
432 if (hmode) { /* Horizontal filter to apply, output to tmp */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
433 static const int shift_value[] = { 0, 5, 1, 5 };\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
434 int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
435 int r;\
11369
98970e51365a Remove DECLARE_ALIGNED_{8,16} macros
mru
parents: 10961
diff changeset
436 DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
437 \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
438 r = (1<<(shift-1)) + rnd-1;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
439 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
440 \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
441 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
442 return;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
443 }\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
444 else { /* No horizontal filter, output 8 lines to dst */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
445 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
446 return;\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
447 }\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
448 }\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
449 \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
450 /* Horizontal mode with no vertical mode */\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
451 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
452 }
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
453
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
454 VC1_MSPEL_MC(put_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
455 VC1_MSPEL_MC(avg_)
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
456
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
457 /** Macro to ease the declaration of the bicubic filter interpolation functions */
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
458 #define DECLARE_FUNCTION(a, b) \
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
459 static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
460 put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
461 }\
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
462 static void avg_vc1_mspel_mc ## a ## b ## _mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
463 avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
464 }
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
465
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
466 DECLARE_FUNCTION(0, 1)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
467 DECLARE_FUNCTION(0, 2)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
468 DECLARE_FUNCTION(0, 3)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
469
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
470 DECLARE_FUNCTION(1, 0)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
471 DECLARE_FUNCTION(1, 1)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
472 DECLARE_FUNCTION(1, 2)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
473 DECLARE_FUNCTION(1, 3)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
474
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
475 DECLARE_FUNCTION(2, 0)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
476 DECLARE_FUNCTION(2, 1)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
477 DECLARE_FUNCTION(2, 2)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
478 DECLARE_FUNCTION(2, 3)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
479
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
480 DECLARE_FUNCTION(3, 0)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
481 DECLARE_FUNCTION(3, 1)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
482 DECLARE_FUNCTION(3, 2)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
483 DECLARE_FUNCTION(3, 3)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
484
9859
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
485 static void vc1_inv_trans_4x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
486 {
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
487 int dc = block[0];
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
488 dc = (17 * dc + 4) >> 3;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
489 dc = (17 * dc + 64) >> 7;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
490 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
491 "movd %0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
492 "pshufw $0, %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
493 "pxor %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
494 "psubw %%mm0, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
495 "packuswb %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
496 "packuswb %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
497 ::"r"(dc)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
498 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
499 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
500 "movd %0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
501 "movd %1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
502 "movd %2, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
503 "movd %3, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
504 "paddusb %%mm0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
505 "paddusb %%mm0, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
506 "paddusb %%mm0, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
507 "paddusb %%mm0, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
508 "psubusb %%mm1, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
509 "psubusb %%mm1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
510 "psubusb %%mm1, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
511 "psubusb %%mm1, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
512 "movd %%mm2, %0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
513 "movd %%mm3, %1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
514 "movd %%mm4, %2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
515 "movd %%mm5, %3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
516 :"+m"(*(uint32_t*)(dest+0*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
517 "+m"(*(uint32_t*)(dest+1*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
518 "+m"(*(uint32_t*)(dest+2*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
519 "+m"(*(uint32_t*)(dest+3*linesize))
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
520 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
521 }
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
522
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
523 static void vc1_inv_trans_4x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
524 {
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
525 int dc = block[0];
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
526 dc = (17 * dc + 4) >> 3;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
527 dc = (12 * dc + 64) >> 7;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
528 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
529 "movd %0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
530 "pshufw $0, %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
531 "pxor %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
532 "psubw %%mm0, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
533 "packuswb %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
534 "packuswb %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
535 ::"r"(dc)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
536 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
537 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
538 "movd %0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
539 "movd %1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
540 "movd %2, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
541 "movd %3, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
542 "paddusb %%mm0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
543 "paddusb %%mm0, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
544 "paddusb %%mm0, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
545 "paddusb %%mm0, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
546 "psubusb %%mm1, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
547 "psubusb %%mm1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
548 "psubusb %%mm1, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
549 "psubusb %%mm1, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
550 "movd %%mm2, %0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
551 "movd %%mm3, %1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
552 "movd %%mm4, %2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
553 "movd %%mm5, %3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
554 :"+m"(*(uint32_t*)(dest+0*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
555 "+m"(*(uint32_t*)(dest+1*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
556 "+m"(*(uint32_t*)(dest+2*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
557 "+m"(*(uint32_t*)(dest+3*linesize))
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
558 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
559 dest += 4*linesize;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
560 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
561 "movd %0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
562 "movd %1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
563 "movd %2, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
564 "movd %3, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
565 "paddusb %%mm0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
566 "paddusb %%mm0, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
567 "paddusb %%mm0, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
568 "paddusb %%mm0, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
569 "psubusb %%mm1, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
570 "psubusb %%mm1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
571 "psubusb %%mm1, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
572 "psubusb %%mm1, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
573 "movd %%mm2, %0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
574 "movd %%mm3, %1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
575 "movd %%mm4, %2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
576 "movd %%mm5, %3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
577 :"+m"(*(uint32_t*)(dest+0*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
578 "+m"(*(uint32_t*)(dest+1*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
579 "+m"(*(uint32_t*)(dest+2*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
580 "+m"(*(uint32_t*)(dest+3*linesize))
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
581 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
582 }
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
583
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
584 static void vc1_inv_trans_8x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
585 {
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
586 int dc = block[0];
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
587 dc = ( 3 * dc + 1) >> 1;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
588 dc = (17 * dc + 64) >> 7;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
589 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
590 "movd %0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
591 "pshufw $0, %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
592 "pxor %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
593 "psubw %%mm0, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
594 "packuswb %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
595 "packuswb %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
596 ::"r"(dc)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
597 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
598 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
599 "movq %0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
600 "movq %1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
601 "movq %2, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
602 "movq %3, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
603 "paddusb %%mm0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
604 "paddusb %%mm0, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
605 "paddusb %%mm0, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
606 "paddusb %%mm0, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
607 "psubusb %%mm1, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
608 "psubusb %%mm1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
609 "psubusb %%mm1, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
610 "psubusb %%mm1, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
611 "movq %%mm2, %0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
612 "movq %%mm3, %1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
613 "movq %%mm4, %2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
614 "movq %%mm5, %3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
615 :"+m"(*(uint32_t*)(dest+0*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
616 "+m"(*(uint32_t*)(dest+1*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
617 "+m"(*(uint32_t*)(dest+2*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
618 "+m"(*(uint32_t*)(dest+3*linesize))
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
619 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
620 }
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
621
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
622 static void vc1_inv_trans_8x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
623 {
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
624 int dc = block[0];
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
625 dc = (3 * dc + 1) >> 1;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
626 dc = (3 * dc + 16) >> 5;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
627 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
628 "movd %0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
629 "pshufw $0, %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
630 "pxor %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
631 "psubw %%mm0, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
632 "packuswb %%mm0, %%mm0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
633 "packuswb %%mm1, %%mm1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
634 ::"r"(dc)
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
635 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
636 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
637 "movq %0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
638 "movq %1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
639 "movq %2, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
640 "movq %3, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
641 "paddusb %%mm0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
642 "paddusb %%mm0, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
643 "paddusb %%mm0, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
644 "paddusb %%mm0, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
645 "psubusb %%mm1, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
646 "psubusb %%mm1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
647 "psubusb %%mm1, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
648 "psubusb %%mm1, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
649 "movq %%mm2, %0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
650 "movq %%mm3, %1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
651 "movq %%mm4, %2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
652 "movq %%mm5, %3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
653 :"+m"(*(uint32_t*)(dest+0*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
654 "+m"(*(uint32_t*)(dest+1*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
655 "+m"(*(uint32_t*)(dest+2*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
656 "+m"(*(uint32_t*)(dest+3*linesize))
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
657 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
658 dest += 4*linesize;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
659 __asm__ volatile(
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
660 "movq %0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
661 "movq %1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
662 "movq %2, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
663 "movq %3, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
664 "paddusb %%mm0, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
665 "paddusb %%mm0, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
666 "paddusb %%mm0, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
667 "paddusb %%mm0, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
668 "psubusb %%mm1, %%mm2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
669 "psubusb %%mm1, %%mm3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
670 "psubusb %%mm1, %%mm4 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
671 "psubusb %%mm1, %%mm5 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
672 "movq %%mm2, %0 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
673 "movq %%mm3, %1 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
674 "movq %%mm4, %2 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
675 "movq %%mm5, %3 \n\t"
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
676 :"+m"(*(uint32_t*)(dest+0*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
677 "+m"(*(uint32_t*)(dest+1*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
678 "+m"(*(uint32_t*)(dest+2*linesize)),
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
679 "+m"(*(uint32_t*)(dest+3*linesize))
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
680 );
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
681 }
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
682
12144
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
/**
 * For one function-name suffix OPT, declare the external 4- and 8-sample
 * vertical/horizontal loop filters and define static 16-sample wrappers.
 *
 * Each wrapper calls the matching 8-sample filter twice: the "v" wrapper
 * on src and src + 8 (8 bytes further along the row), the "h" wrapper on
 * src and src + 8 * stride (8 rows further down). All arguments are passed
 * through unchanged; pq is presumably the quantizer used by the filter --
 * confirm against the filter implementation.
 */
#define LOOP_FILTER(OPT) \
void ff_vc1_v_loop_filter4_ ## OPT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter4_ ## OPT(uint8_t *src, int stride, int pq); \
void ff_vc1_v_loop_filter8_ ## OPT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter8_ ## OPT(uint8_t *src, int stride, int pq); \
\
static void vc1_v_loop_filter16_ ## OPT(uint8_t *src, int stride, int pq) \
{ \
    ff_vc1_v_loop_filter8_ ## OPT(src,     stride, pq); \
    ff_vc1_v_loop_filter8_ ## OPT(src + 8, stride, pq); \
} \
\
static void vc1_h_loop_filter16_ ## OPT(uint8_t *src, int stride, int pq) \
{ \
    ff_vc1_h_loop_filter8_ ## OPT(src,              stride, pq); \
    ff_vc1_h_loop_filter8_ ## OPT(src + 8 * stride, stride, pq); \
}
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
700
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
#if HAVE_YASM
/* Prototypes and 16-sample wrappers for each supported suffix. */
LOOP_FILTER(mmx)
LOOP_FILTER(mmx2)
LOOP_FILTER(sse2)
LOOP_FILTER(ssse3)

/* Only the 8-sample horizontal filter is declared for sse4; it is defined
 * outside this file (presumably in the yasm sources, given the guard). */
void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq);

/**
 * 16-sample horizontal loop filter built from two 8-sample sse4 calls:
 * one at src and one 8 rows further down.
 */
static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
{
    uint8_t *lower = src + 8 * stride;

    ff_vc1_h_loop_filter8_sse4(src,   stride, pq);
    ff_vc1_h_loop_filter8_sse4(lower, stride, pq);
}
#endif
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
715
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
716 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
12414
3fc4c625b6f3 Remove global mm_flags variable
mru
parents: 12206
diff changeset
717 int mm_flags = mm_support();
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
718
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
719 dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
720 dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
721 dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
722 dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
723
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
724 dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
725 dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
726 dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
727 dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
728
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
729 dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
730 dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
731 dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
732 dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
733
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
734 dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
735 dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
736 dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
737 dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
738
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
739 if (mm_flags & FF_MM_MMX2){
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
740 dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
741 dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
742 dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
743 dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
744
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
745 dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
746 dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
747 dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
748 dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
749
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
750 dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
751 dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
752 dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
753 dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
754
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
755 dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
756 dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
757 dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2;
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
758 dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2;
9859
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
759
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
760 dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
761 dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
762 dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2;
7a116de63777 idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
darkshikari
parents: 9441
diff changeset
763 dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2;
9441
e14cd3ac3806 VC1: extend MMX qpel MC to include MMX2 avg qpel
conrad
parents: 8430
diff changeset
764 }
12144
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
765
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
/*
 * ASSIGN_LF(EXT): set all six loop-filter pointers of dsp
 * (vc1_{v,h}_loop_filter{4,8,16}) to the functions whose names end in
 * the pasted suffix EXT.
 * The 4 and 8 entries take ff_-prefixed symbols; the 16 entries take
 * symbols without the ff_ prefix.
 * The expansion ends without a semicolon, so the call site supplies it.
 */
766 #define ASSIGN_LF(EXT) \
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
767 dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_ ## EXT; \
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
768 dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_ ## EXT; \
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
769 dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_ ## EXT; \
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
770 dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_ ## EXT; \
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
771 dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_ ## EXT; \
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
772 dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
773
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
774 #if HAVE_YASM
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
775 if (mm_flags & FF_MM_MMX) {
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
776 ASSIGN_LF(mmx);
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
777 }
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
778 return;
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
779 if (mm_flags & FF_MM_MMX2) {
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
780 ASSIGN_LF(mmx2);
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
781 }
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
782 if (mm_flags & FF_MM_SSE2) {
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
783 dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2;
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
784 dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2;
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
785 dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
786 dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
787 }
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
788 if (mm_flags & FF_MM_SSSE3) {
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
789 ASSIGN_LF(ssse3);
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
790 }
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
791 if (mm_flags & FF_MM_SSE4) {
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
792 dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4;
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
793 dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4;
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
794 }
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents: 12024
diff changeset
795 #endif
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
796 }