annotate i386/dsputil_mmx.c @ 1708:dea5b2946999 libavcodec

interlaced motion estimation; interlaced mpeg2 encoding; P & B frames; rate distorted interlaced mb decision; alternate scantable support; 4mv encoding fixes (that's also why the regression tests change); passing height to most dsp functions; interlaced mpeg4 encoding (no direct mode MBs yet); various related cleanups; disabled old motion estimation algorithms (log, full, ...), they will either be fixed or removed
author michael
date Tue, 30 Dec 2003 16:07:57 +0000
parents 68abbec33289
children a4a5e7521339
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1 /*
986e461dc072 Initial revision
glantau
parents:
diff changeset
2 * MMX optimized DSP utils
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
4 *
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
5 * This library is free software; you can redistribute it and/or
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
6 * modify it under the terms of the GNU Lesser General Public
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
7 * License as published by the Free Software Foundation; either
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
8 * version 2 of the License, or (at your option) any later version.
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
9 *
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
10 * This library is distributed in the hope that it will be useful,
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
13 * Lesser General Public License for more details.
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
14 *
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
15 * You should have received a copy of the GNU Lesser General Public
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
16 * License along with this library; if not, write to the Free Software
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
18 *
986e461dc072 Initial revision
glantau
parents:
diff changeset
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
986e461dc072 Initial revision
glantau
parents:
diff changeset
20 */
986e461dc072 Initial revision
glantau
parents:
diff changeset
21
986e461dc072 Initial revision
glantau
parents:
diff changeset
22 #include "../dsputil.h"
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
23 #include "../simple_idct.h"
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
24
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
25 extern const uint8_t ff_h263_loop_filter_strength[32];
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
26
5
4479bcab253e suppressed no longer needed emms()
glantau
parents: 0
diff changeset
27 int mm_flags; /* multimedia extension flags */
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
28
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
29 /* pixel operations */
387
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
30 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
31 static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
32 static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
33
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
34 static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
35 static const uint64_t ff_pw_3 __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
36 static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
37 static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
38
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
39 static const uint64_t ff_pb_FC __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
40
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
41 #define JUMPALIGN() __asm __volatile (".balign 8"::)
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
42 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
43
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
44 #define MOVQ_WONE(regd) \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
45 __asm __volatile ( \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
46 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
47 "psrlw $15, %%" #regd ::)
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
48
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
49 #define MOVQ_BFE(regd) \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
50 __asm __volatile ( \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
51 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
52 "paddb %%" #regd ", %%" #regd " \n\t" ::)
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
53
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
54 #ifndef PIC
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
55 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
56 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
57 #else
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
58 // for shared library it's better to use this way for accessing constants
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
59 // pcmpeqd -> -1
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
60 #define MOVQ_BONE(regd) \
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
61 __asm __volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
62 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
63 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
64 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
65
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
66 #define MOVQ_WTWO(regd) \
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
67 __asm __volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
68 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
69 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
70 "psllw $1, %%" #regd " \n\t"::)
387
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
71
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
72 #endif
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
73
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
74 // using regr as temporary and for the output result
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
75 // first argument is unmodified and second is trashed
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
76 // regfe is supposed to contain 0xfefefefefefefefe
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
77 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
78 "movq " #rega ", " #regr " \n\t"\
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
79 "pand " #regb ", " #regr " \n\t"\
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
80 "pxor " #rega ", " #regb " \n\t"\
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
81 "pand " #regfe "," #regb " \n\t"\
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
82 "psrlq $1, " #regb " \n\t"\
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
83 "paddb " #regb ", " #regr " \n\t"
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
84
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
85 #define PAVGB_MMX(rega, regb, regr, regfe) \
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
86 "movq " #rega ", " #regr " \n\t"\
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
87 "por " #regb ", " #regr " \n\t"\
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
88 "pxor " #rega ", " #regb " \n\t"\
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
89 "pand " #regfe "," #regb " \n\t"\
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
90 "psrlq $1, " #regb " \n\t"\
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
91 "psubb " #regb ", " #regr " \n\t"
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
92
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
93 // mm6 is supposed to contain 0xfefefefefefefefe
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
94 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
95 "movq " #rega ", " #regr " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
96 "movq " #regc ", " #regp " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
97 "pand " #regb ", " #regr " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
98 "pand " #regd ", " #regp " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
99 "pxor " #rega ", " #regb " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
100 "pxor " #regc ", " #regd " \n\t"\
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
101 "pand %%mm6, " #regb " \n\t"\
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
102 "pand %%mm6, " #regd " \n\t"\
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
103 "psrlq $1, " #regb " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
104 "psrlq $1, " #regd " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
105 "paddb " #regb ", " #regr " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
106 "paddb " #regd ", " #regp " \n\t"
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
107
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
108 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
109 "movq " #rega ", " #regr " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
110 "movq " #regc ", " #regp " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
111 "por " #regb ", " #regr " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
112 "por " #regd ", " #regp " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
113 "pxor " #rega ", " #regb " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
114 "pxor " #regc ", " #regd " \n\t"\
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
115 "pand %%mm6, " #regb " \n\t"\
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
116 "pand %%mm6, " #regd " \n\t"\
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
117 "psrlq $1, " #regd " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
118 "psrlq $1, " #regb " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
119 "psubb " #regb ", " #regr " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
120 "psubb " #regd ", " #regp " \n\t"
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
121
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
122 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
123 /* MMX no rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
124 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
125 #define SET_RND MOVQ_WONE
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
126 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
127 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
128
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
129 #include "dsputil_mmx_rnd.h"
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
130
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
131 #undef DEF
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
132 #undef SET_RND
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
133 #undef PAVGBP
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
134 #undef PAVGB
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
135 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
136 /* MMX rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
137
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
138 #define DEF(x, y) x ## _ ## y ##_mmx
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
139 #define SET_RND MOVQ_WTWO
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
140 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
141 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
142
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
143 #include "dsputil_mmx_rnd.h"
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
144
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
145 #undef DEF
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
146 #undef SET_RND
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
147 #undef PAVGBP
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
148 #undef PAVGB
387
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
149
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
150 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
151 /* 3Dnow specific */
986e461dc072 Initial revision
glantau
parents:
diff changeset
152
986e461dc072 Initial revision
glantau
parents:
diff changeset
153 #define DEF(x) x ## _3dnow
986e461dc072 Initial revision
glantau
parents:
diff changeset
154 /* for Athlons PAVGUSB is preferred */
986e461dc072 Initial revision
glantau
parents:
diff changeset
155 #define PAVGB "pavgusb"
986e461dc072 Initial revision
glantau
parents:
diff changeset
156
986e461dc072 Initial revision
glantau
parents:
diff changeset
157 #include "dsputil_mmx_avg.h"
986e461dc072 Initial revision
glantau
parents:
diff changeset
158
986e461dc072 Initial revision
glantau
parents:
diff changeset
159 #undef DEF
986e461dc072 Initial revision
glantau
parents:
diff changeset
160 #undef PAVGB
986e461dc072 Initial revision
glantau
parents:
diff changeset
161
986e461dc072 Initial revision
glantau
parents:
diff changeset
162 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
163 /* MMX2 specific */
986e461dc072 Initial revision
glantau
parents:
diff changeset
164
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
165 #define DEF(x) x ## _mmx2
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
166
986e461dc072 Initial revision
glantau
parents:
diff changeset
167 /* Introduced only in MMX2 set */
986e461dc072 Initial revision
glantau
parents:
diff changeset
168 #define PAVGB "pavgb"
986e461dc072 Initial revision
glantau
parents:
diff changeset
169
986e461dc072 Initial revision
glantau
parents:
diff changeset
170 #include "dsputil_mmx_avg.h"
986e461dc072 Initial revision
glantau
parents:
diff changeset
171
986e461dc072 Initial revision
glantau
parents:
diff changeset
172 #undef DEF
986e461dc072 Initial revision
glantau
parents:
diff changeset
173 #undef PAVGB
986e461dc072 Initial revision
glantau
parents:
diff changeset
174
986e461dc072 Initial revision
glantau
parents:
diff changeset
175 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
176 /* standard MMX */
986e461dc072 Initial revision
glantau
parents:
diff changeset
177
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
178 #ifdef CONFIG_ENCODERS
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
179 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
180 {
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
181 asm volatile(
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
182 "movl $-128, %%eax \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
183 "pxor %%mm7, %%mm7 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
184 ".balign 16 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
185 "1: \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
186 "movq (%0), %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
187 "movq (%0, %2), %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
188 "movq %%mm0, %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
189 "movq %%mm2, %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
190 "punpcklbw %%mm7, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
191 "punpckhbw %%mm7, %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
192 "punpcklbw %%mm7, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
193 "punpckhbw %%mm7, %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
194 "movq %%mm0, (%1, %%eax)\n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
195 "movq %%mm1, 8(%1, %%eax)\n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
196 "movq %%mm2, 16(%1, %%eax)\n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
197 "movq %%mm3, 24(%1, %%eax)\n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
198 "addl %3, %0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
199 "addl $32, %%eax \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
200 "js 1b \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
201 : "+r" (pixels)
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
202 : "r" (block+64), "r" (line_size), "r" (line_size*2)
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
203 : "%eax"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
204 );
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
205 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
206
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
207 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
324
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
208 {
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
209 asm volatile(
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
210 "pxor %%mm7, %%mm7 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
211 "movl $-128, %%eax \n\t"
324
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
212 ".balign 16 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
213 "1: \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
214 "movq (%0), %%mm0 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
215 "movq (%1), %%mm2 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
216 "movq %%mm0, %%mm1 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
217 "movq %%mm2, %%mm3 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
218 "punpcklbw %%mm7, %%mm0 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
219 "punpckhbw %%mm7, %%mm1 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
220 "punpcklbw %%mm7, %%mm2 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
221 "punpckhbw %%mm7, %%mm3 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
222 "psubw %%mm2, %%mm0 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
223 "psubw %%mm3, %%mm1 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
224 "movq %%mm0, (%2, %%eax)\n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
225 "movq %%mm1, 8(%2, %%eax)\n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
226 "addl %3, %0 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
227 "addl %3, %1 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
228 "addl $16, %%eax \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
229 "jnz 1b \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
230 : "+r" (s1), "+r" (s2)
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
231 : "r" (block+64), "r" (stride)
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
232 : "%eax"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
233 );
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
234 }
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
235 #endif //CONFIG_ENCODERS
324
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
236
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
237 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
238 {
986e461dc072 Initial revision
glantau
parents:
diff changeset
239 const DCTELEM *p;
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
240 uint8_t *pix;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
241
986e461dc072 Initial revision
glantau
parents:
diff changeset
242 /* read the pixels */
986e461dc072 Initial revision
glantau
parents:
diff changeset
243 p = block;
986e461dc072 Initial revision
glantau
parents:
diff changeset
244 pix = pixels;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
245 /* unrolled loop */
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
246 __asm __volatile(
151
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
247 "movq %3, %%mm0\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
248 "movq 8%3, %%mm1\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
249 "movq 16%3, %%mm2\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
250 "movq 24%3, %%mm3\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
251 "movq 32%3, %%mm4\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
252 "movq 40%3, %%mm5\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
253 "movq 48%3, %%mm6\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
254 "movq 56%3, %%mm7\n\t"
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
255 "packuswb %%mm1, %%mm0\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
256 "packuswb %%mm3, %%mm2\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
257 "packuswb %%mm5, %%mm4\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
258 "packuswb %%mm7, %%mm6\n\t"
151
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
259 "movq %%mm0, (%0)\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
260 "movq %%mm2, (%0, %1)\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
261 "movq %%mm4, (%0, %1, 2)\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
262 "movq %%mm6, (%0, %2)\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
263 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
264 :"memory");
986e461dc072 Initial revision
glantau
parents:
diff changeset
265 pix += line_size*4;
986e461dc072 Initial revision
glantau
parents:
diff changeset
266 p += 32;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
267
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
268 // if here would be an exact copy of the code above
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
269 // compiler would generate some very strange code
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
270 // thus using "r"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
271 __asm __volatile(
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
272 "movq (%3), %%mm0\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
273 "movq 8(%3), %%mm1\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
274 "movq 16(%3), %%mm2\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
275 "movq 24(%3), %%mm3\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
276 "movq 32(%3), %%mm4\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
277 "movq 40(%3), %%mm5\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
278 "movq 48(%3), %%mm6\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
279 "movq 56(%3), %%mm7\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
280 "packuswb %%mm1, %%mm0\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
281 "packuswb %%mm3, %%mm2\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
282 "packuswb %%mm5, %%mm4\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
283 "packuswb %%mm7, %%mm6\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
284 "movq %%mm0, (%0)\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
285 "movq %%mm2, (%0, %1)\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
286 "movq %%mm4, (%0, %1, 2)\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
287 "movq %%mm6, (%0, %2)\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
288 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
289 :"memory");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
290 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
291
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
292 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
293 {
986e461dc072 Initial revision
glantau
parents:
diff changeset
294 const DCTELEM *p;
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
295 uint8_t *pix;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
296 int i;
986e461dc072 Initial revision
glantau
parents:
diff changeset
297
986e461dc072 Initial revision
glantau
parents:
diff changeset
298 /* read the pixels */
986e461dc072 Initial revision
glantau
parents:
diff changeset
299 p = block;
986e461dc072 Initial revision
glantau
parents:
diff changeset
300 pix = pixels;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
301 MOVQ_ZERO(mm7);
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
302 i = 4;
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
303 do {
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
304 __asm __volatile(
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
305 "movq (%2), %%mm0\n\t"
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
306 "movq 8(%2), %%mm1\n\t"
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
307 "movq 16(%2), %%mm2\n\t"
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
308 "movq 24(%2), %%mm3\n\t"
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
309 "movq %0, %%mm4\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
310 "movq %1, %%mm6\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
311 "movq %%mm4, %%mm5\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
312 "punpcklbw %%mm7, %%mm4\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
313 "punpckhbw %%mm7, %%mm5\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
314 "paddsw %%mm4, %%mm0\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
315 "paddsw %%mm5, %%mm1\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
316 "movq %%mm6, %%mm5\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
317 "punpcklbw %%mm7, %%mm6\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
318 "punpckhbw %%mm7, %%mm5\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
319 "paddsw %%mm6, %%mm2\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
320 "paddsw %%mm5, %%mm3\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
321 "packuswb %%mm1, %%mm0\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
322 "packuswb %%mm3, %%mm2\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
323 "movq %%mm0, %0\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
324 "movq %%mm2, %1\n\t"
151
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
325 :"+m"(*pix), "+m"(*(pix+line_size))
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
326 :"r"(p)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
327 :"memory");
986e461dc072 Initial revision
glantau
parents:
diff changeset
328 pix += line_size*2;
986e461dc072 Initial revision
glantau
parents:
diff changeset
329 p += 16;
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
330 } while (--i);
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
331 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
332
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
/**
 * Copy an 8-pixel-wide column of h rows from pixels to block.
 *
 * Both buffers use the same row pitch. Four rows are copied per loop pass
 * and the loop only ends when the row counter reaches exactly zero, so h
 * must be a positive multiple of 4.
 *
 * The row strides are passed as pointer-width operands instead of being
 * computed into a hard-coded 32-bit %eax, so the pointer increments are
 * correct for any pointer size and line_size is properly sign-extended.
 *
 * @param block     destination, 8 bytes written per row
 * @param pixels    source, 8 bytes read per row
 * @param line_size distance in bytes between two rows (source and destination)
 * @param h         number of rows, positive multiple of 4
 */
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const long stride  = line_size;   /* one row   */
    const long stride2 = 2 * stride;  /* two rows  */

    __asm __volatile(
        ".balign 8 \n\t"
        "1: \n\t"
        /* rows 0 and 1 */
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %4, %1 \n\t"
        "add %4, %2 \n\t"
        /* rows 2 and 3 */
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %4, %1 \n\t"
        "add %4, %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(stride), "r"(stride2)
        : "memory"
    );
}
986e461dc072 Initial revision
glantau
parents:
diff changeset
358
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
/**
 * Copy a 16-pixel-wide column of h rows from pixels to block.
 *
 * Both buffers use the same row pitch. Four rows are copied per loop pass
 * and the loop only ends when the row counter reaches exactly zero, so h
 * must be a positive multiple of 4.
 *
 * The row strides are passed as pointer-width operands instead of being
 * computed into a hard-coded 32-bit %eax, so the pointer increments are
 * correct for any pointer size and line_size is properly sign-extended.
 *
 * @param block     destination, 16 bytes written per row
 * @param pixels    source, 16 bytes read per row
 * @param line_size distance in bytes between two rows (source and destination)
 * @param h         number of rows, positive multiple of 4
 */
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const long stride  = line_size;   /* one row   */
    const long stride2 = 2 * stride;  /* two rows  */

    __asm __volatile(
        ".balign 8 \n\t"
        "1: \n\t"
        /* rows 0 and 1: left half in mm0/mm1, right half in mm4/mm5 */
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %4, %1 \n\t"
        "add %4, %2 \n\t"
        /* rows 2 and 3 */
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %4, %1 \n\t"
        "add %4, %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"(stride), "r"(stride2)
        : "memory"
    );
}
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
392
296
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
393 static void clear_blocks_mmx(DCTELEM *blocks)
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
394 {
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
395 __asm __volatile(
296
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
396 "pxor %%mm7, %%mm7 \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
397 "movl $-128*6, %%eax \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
398 "1: \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
399 "movq %%mm7, (%0, %%eax) \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
400 "movq %%mm7, 8(%0, %%eax) \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
401 "movq %%mm7, 16(%0, %%eax) \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
402 "movq %%mm7, 24(%0, %%eax) \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
403 "addl $32, %%eax \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
404 " js 1b \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
405 : : "r" (((int)blocks)+128*6)
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
406 : "%eax"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
407 );
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
408 }
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
409
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
410 #ifdef CONFIG_ENCODERS
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
/**
 * Sum all pixels of a 16x16 block.
 *
 * Rows are addressed as end + index, with index running from -16*line_size
 * up to 0; the sign flag after the add is the loop condition, so line_size
 * must be positive. Pixels are widened to 16-bit words and accumulated in
 * four word lanes of mm6, which are then folded together; the final result
 * is masked to 16 bits (the maximum possible sum, 256*255, fits).
 *
 * Differences from the previous version: the asm declares a "memory"
 * clobber because it reads pixel data that is not described by any operand,
 * and index/stride are pointer-width so the address arithmetic is valid for
 * any pointer size.
 *
 * @param pix       top-left pixel of the block
 * @param line_size distance in bytes between two rows, must be > 0
 * @return sum of the 256 pixel values
 */
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    const long stride= line_size;
    long index= -stride*h;
    int sum;

    __asm __volatile(
        "pxor %%mm7, %%mm7 \n\t"    /* zero, high half for widening */
        "pxor %%mm6, %%mm6 \n\t"    /* 4 x 16-bit accumulators      */
        "1: \n\t"
        "movq (%2, %1), %%mm0 \n\t"
        "movq (%2, %1), %%mm1 \n\t"
        "movq 8(%2, %1), %%mm2 \n\t"
        "movq 8(%2, %1), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "paddw %%mm1, %%mm3 \n\t"
        "paddw %%mm3, %%mm6 \n\t"
        "add %3, %1 \n\t"
        " js 1b \n\t"
        /* fold the four word lanes into the lowest one */
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        "andl $0xFFFF, %0 \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" (stride)
        : "memory"
    );

    return sum;
}
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
448 #endif //CONFIG_ENCODERS
688
894b61908734 pix_sum16_mmx()
michaelni
parents: 651
diff changeset
449
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
/**
 * dst[i] += src[i] for i in [0, w), with byte wrap-around.
 *
 * Full 16-byte chunks are handled by the MMX loop, the remaining 0..15
 * bytes by the scalar loop at the end.
 *
 * The MMX loop tests its bound only after the first pass, so it must not be
 * entered when fewer than 16 bytes are requested: previously any w < 16
 * still processed 16 bytes, and for w < 15 the unsigned compare against the
 * negative bound w-15 kept the loop running far past both buffers. The asm
 * also declares a "memory" clobber now (it loads and stores through
 * pointers that are plain register inputs), and the index is pointer-width
 * so it is valid in the addressing modes for any pointer size.
 *
 * @param dst bytes that are updated in place
 * @param src bytes that are added
 * @param w   number of bytes; values <= 0 do nothing
 */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;

    if (w >= 16) {
        __asm __volatile(
            "1: \n\t"
            "movq (%1, %0), %%mm0 \n\t"
            "movq (%2, %0), %%mm1 \n\t"
            "paddb %%mm0, %%mm1 \n\t"
            "movq %%mm1, (%2, %0) \n\t"
            "movq 8(%1, %0), %%mm0 \n\t"
            "movq 8(%2, %0), %%mm1 \n\t"
            "paddb %%mm0, %%mm1 \n\t"
            "movq %%mm1, 8(%2, %0) \n\t"
            "add $16, %0 \n\t"
            "cmp %3, %0 \n\t"       /* continue while i < w-15, i.e. i+16 <= w */
            " jb 1b \n\t"
            : "+r" (i)
            : "r"(src), "r"(dst), "r"((long)w-15)
            : "memory"
        );
    }
    /* tail: fewer than 16 bytes left */
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
471
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
/*
 * Shared instruction sequence of the H.263 loop filter, meant to be pasted
 * into an asm statement with these operands:
 *   %0 %1 %2 %3  four 8-byte pixel rows/columns across the edge, in order
 *   %4           value whose low byte is broadcast and used as 2*strength
 *   %5           8-byte mask applied before the final >>2 (the caller in
 *                this file passes ff_pb_FC)
 * On exit the filtered data is left in registers for the caller to store:
 *   mm3 -> %1, mm4 -> %2, mm5 -> %0, mm6 -> %3
 * All of mm0..mm7 are overwritten.
 */
#define H263_LOOP_FILTER \
    /* mm0/mm1 = %0 - %3 as words (low/high half) */\
    "pxor %%mm7, %%mm7 \n\t"\
    "movq %0, %%mm0 \n\t"\
    "movq %0, %%mm1 \n\t"\
    "movq %3, %%mm2 \n\t"\
    "movq %3, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpckhbw %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "psubw %%mm2, %%mm0 \n\t"\
    "psubw %%mm3, %%mm1 \n\t"\
    /* mm4/mm5 = 4*(%2 - %1) + (%0 - %3) as words */\
    "movq %1, %%mm2 \n\t"\
    "movq %1, %%mm3 \n\t"\
    "movq %2, %%mm4 \n\t"\
    "movq %2, %%mm5 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm7, %%mm5 \n\t"\
    "psubw %%mm2, %%mm4 \n\t"\
    "psubw %%mm3, %%mm5 \n\t"\
    "psllw $2, %%mm4 \n\t"\
    "psllw $2, %%mm5 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    /* split into sign masks (mm6/mm7) and magnitude >> 3 (mm4/mm5) */\
    "pxor %%mm6, %%mm6 \n\t"\
    "pcmpgtw %%mm4, %%mm6 \n\t"\
    "pcmpgtw %%mm5, %%mm7 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "pxor %%mm7, %%mm5 \n\t"\
    "psubw %%mm6, %%mm4 \n\t"\
    "psubw %%mm7, %%mm5 \n\t"\
    "psrlw $3, %%mm4 \n\t"\
    "psrlw $3, %%mm5 \n\t"\
    /* back to bytes: mm4 = magnitude, mm6 = sign mask */\
    "packuswb %%mm5, %%mm4 \n\t"\
    "packsswb %%mm7, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    /* broadcast the low byte of %4 to all 8 bytes of mm2 */\
    "movd %4, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    /* mm2 = sat(%4 - m) - sat(sat(%4 - m) - m), with m = magnitude in mm4 */\
    "psubusb %%mm4, %%mm2 \n\t"\
    "movq %%mm2, %%mm3 \n\t"\
    "psubusb %%mm4, %%mm3 \n\t"\
    "psubb %%mm3, %%mm2 \n\t"\
    /* apply mm2 with the sign in mm6: added to %1 (mm3), subtracted from  */\
    /* %2 (mm4), both with unsigned saturation; the xor pair flips the     */\
    /* direction for bytes whose sign mask is set                          */\
    "movq %1, %%mm3 \n\t"\
    "movq %2, %%mm4 \n\t"\
    "pxor %%mm6, %%mm3 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "paddusb %%mm2, %%mm3 \n\t"\
    "psubusb %%mm2, %%mm4 \n\t"\
    "pxor %%mm6, %%mm3 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    /* mm2 doubled; mm0 = |%0 - %3| as bytes with sign mask in mm7 */\
    "paddusb %%mm2, %%mm2 \n\t"\
    "packsswb %%mm1, %%mm0 \n\t"\
    "pcmpgtb %%mm0, %%mm7 \n\t"\
    "pxor %%mm7, %%mm0 \n\t"\
    "psubb %%mm7, %%mm0 \n\t"\
    /* mm1 = min(|%0 - %3|, mm2), masked with %5, >> 2, sign restored */\
    "movq %%mm0, %%mm1 \n\t"\
    "psubusb %%mm2, %%mm0 \n\t"\
    "psubb %%mm0, %%mm1 \n\t"\
    "pand %5, %%mm1 \n\t"\
    "psrlw $2, %%mm1 \n\t"\
    "pxor %%mm7, %%mm1 \n\t"\
    "psubb %%mm7, %%mm1 \n\t"\
    /* outer rows: mm5 = %0 - mm1, mm6 = %3 + mm1 */\
    "movq %0, %%mm5 \n\t"\
    "movq %3, %%mm6 \n\t"\
    "psubb %%mm1, %%mm5 \n\t"\
    "paddb %%mm1, %%mm6 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
542
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
543 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
544 const int strength= ff_h263_loop_filter_strength[qscale];
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
545
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
546 asm volatile(
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
547
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
548 H263_LOOP_FILTER
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
549
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
550 "movq %%mm3, %1 \n\t"
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
551 "movq %%mm4, %2 \n\t"
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
552 "movq %%mm5, %0 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
553 "movq %%mm6, %3 \n\t"
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
554 : "+m" (*(uint64_t*)(src - 2*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
555 "+m" (*(uint64_t*)(src - 1*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
556 "+m" (*(uint64_t*)(src + 0*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
557 "+m" (*(uint64_t*)(src + 1*stride))
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
558 : "g" (2*strength), "m"(ff_pb_FC)
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
559 );
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
560 }
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
561
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
/**
 * Transpose a 4x4 block of bytes with MMX.
 *
 * Four 32-bit rows are read from src (src_stride bytes apart) and four
 * 32-bit rows are written to dst (dst_stride bytes apart) so that
 * dst[r][c] == src[c][r].
 *
 * Uses mm0-mm3; no emms is issued here.
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
        /* load the four source rows: a, b, c, d */
        "movd  %4, %%mm0                \n\t"
        "movd  %5, %%mm1                \n\t"
        "movd  %6, %%mm2                \n\t"
        "movd  %7, %%mm3                \n\t"

        /* interleave bytes: mm0 = a0 b0 a1 b1 ..., mm2 = c0 d0 c1 d1 ... */
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"

        /* interleave words: mm0 = columns 0,1   mm1 = columns 2,3 */
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"

        /* store each 32-bit half as one destination row */
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        : "m" (*(uint32_t*)(src + 0*src_stride)),
          "m" (*(uint32_t*)(src + 1*src_stride)),
          "m" (*(uint32_t*)(src + 2*src_stride)),
          "m" (*(uint32_t*)(src + 3*src_stride))
    );
}
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
590
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
591 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
592 const int strength= ff_h263_loop_filter_strength[qscale];
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
593 uint64_t temp[4] __attribute__ ((aligned(8)));
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
594 uint8_t *btemp= (uint8_t*)temp;
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
595
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
596 src -= 2;
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
597
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
598 transpose4x4(btemp , src , 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
599 transpose4x4(btemp+4, src + 4*stride, 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
600 asm volatile(
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
601 H263_LOOP_FILTER // 5 3 4 6
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
602
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
603 : "+m" (temp[0]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
604 "+m" (temp[1]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
605 "+m" (temp[2]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
606 "+m" (temp[3])
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
607 : "g" (2*strength), "m"(ff_pb_FC)
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
608 );
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
609
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
610 asm volatile(
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
611 "movq %%mm5, %%mm1 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
612 "movq %%mm4, %%mm0 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
613 "punpcklbw %%mm3, %%mm5 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
614 "punpcklbw %%mm6, %%mm4 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
615 "punpckhbw %%mm3, %%mm1 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
616 "punpckhbw %%mm6, %%mm0 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
617 "movq %%mm5, %%mm3 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
618 "movq %%mm1, %%mm6 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
619 "punpcklwd %%mm4, %%mm5 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
620 "punpcklwd %%mm0, %%mm1 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
621 "punpckhwd %%mm4, %%mm3 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
622 "punpckhwd %%mm0, %%mm6 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
623 "movd %%mm5, %0 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
624 "punpckhdq %%mm5, %%mm5 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
625 "movd %%mm5, %1 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
626 "movd %%mm3, %2 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
627 "punpckhdq %%mm3, %%mm3 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
628 "movd %%mm3, %3 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
629 "movd %%mm1, %4 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
630 "punpckhdq %%mm1, %%mm1 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
631 "movd %%mm1, %5 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
632 "movd %%mm6, %6 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
633 "punpckhdq %%mm6, %%mm6 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
634 "movd %%mm6, %7 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
635 : "=m" (*(uint32_t*)(src + 0*stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
636 "=m" (*(uint32_t*)(src + 1*stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
637 "=m" (*(uint32_t*)(src + 2*stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
638 "=m" (*(uint32_t*)(src + 3*stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
639 "=m" (*(uint32_t*)(src + 4*stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
640 "=m" (*(uint32_t*)(src + 5*stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
641 "=m" (*(uint32_t*)(src + 6*stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
642 "=m" (*(uint32_t*)(src + 7*stride))
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
643 );
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
644 }
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
645
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
646 #ifdef CONFIG_ENCODERS
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
/**
 * Sum of squared pixel values over a 16x16 block (MMX).
 *
 * @param pix       top-left pixel of the block; 16 bytes are read per row
 * @param line_size distance in bytes between consecutive rows
 * @return the sum of pix[y*line_size + x]^2 for 0 <= x,y < 16
 *
 * The asm reads the pixels through a register pointer, so a "memory"
 * clobber is declared to keep the compiler from reordering or dropping
 * earlier stores to the block. The row stride is added at pointer width
 * (long), which is identical on i386 and also assembles on LP64 x86-64.
 * No emms is issued here.
 */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    __asm__ volatile (
        "movl $16,%%ecx\n"              /* 16 rows */
        "pxor %%mm0,%%mm0\n"            /* mm0 = 0, used for byte->word unpacking */
        "pxor %%mm7,%%mm7\n"            /* mm7 = running sum (two dwords) */
        "1:\n"
        "movq (%0),%%mm2\n"             /* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n"            /* mm3 = pix[8-15] */

        "movq %%mm2,%%mm1\n"            /* mm1 = mm2 = pix[0-7] */

        "punpckhbw %%mm0,%%mm1\n"       /* mm1 = words of pix[4-7] */
        "punpcklbw %%mm0,%%mm2\n"       /* mm2 = words of pix[0-3] */

        "movq %%mm3,%%mm4\n"            /* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n"       /* mm3 = words of pix[12-15] */
        "punpcklbw %%mm0,%%mm4\n"       /* mm4 = words of pix[8-11] */

        "pmaddwd %%mm1,%%mm1\n"         /* mm1 = (pix4^2+pix5^2, pix6^2+pix7^2) */
        "pmaddwd %%mm2,%%mm2\n"         /* mm2 = (pix0^2+pix1^2, pix2^2+pix3^2) */

        "pmaddwd %%mm3,%%mm3\n"         /* mm3 = (pix12^2+pix13^2, pix14^2+pix15^2) */
        "pmaddwd %%mm4,%%mm4\n"         /* mm4 = (pix8^2+pix9^2, pix10^2+pix11^2) */

        "paddd %%mm1,%%mm2\n"           /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                                  pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"           /* same for pix[8-15] */
        "paddd %%mm2,%%mm7\n"

        "add %2, %0\n"                  /* next row */
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"            /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"           /* lo dword = total */
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((long)line_size)
        : "%ecx", "memory" );
    return tmp;
}
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
689
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
/**
 * Sum of squared errors between two 16-pixel-wide blocks (MMX).
 *
 * @param v         unused by this implementation
 * @param pix1      first block, 16 bytes read per row
 * @param pix2      second block, 16 bytes read per row
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @param h         number of rows; must be > 0 (the loop is count-down,
 *                  test-at-end)
 * @return the sum of (pix1[y][x] - pix2[y][x])^2 for 0 <= x < 16, 0 <= y < h
 *
 * Both blocks are read through register pointers, so a "memory" clobber is
 * declared to keep the compiler from reordering or dropping earlier stores
 * to them. The row stride is added at pointer width (long), which is
 * identical on i386 and also assembles on LP64 x86-64.
 * No emms is issued here.
 */
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %4,%%ecx\n"               /* row counter = h */
        "pxor %%mm0,%%mm0\n"            /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"            /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"             /* mm1 = pix1[0-7] */
        "movq (%1),%%mm2\n"             /* mm2 = pix2[0-7] */
        "movq 8(%0),%%mm3\n"            /* mm3 = pix1[8-15] */
        "movq 8(%1),%%mm4\n"            /* mm4 = pix2[8-15] */

        /* absolute difference: subtract both ways with unsigned saturation
           (one direction saturates to 0) and OR the two results */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"             /* mm2 = |pix1-pix2| [0-7] */
        "por %%mm3,%%mm4\n"             /* mm4 = |pix1-pix2| [8-15] */

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"       /* diffs [0-7] now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n"       /* diffs [8-15] now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "add %3,%0\n"                   /* advance both blocks one row */
        "add %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"            /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx", "memory");
    return tmp;
}
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
749
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
/**
 * Byte-wise difference dst[i] = src1[i] - src2[i] (modulo 256) for
 * 0 <= i < w.
 *
 * Full 16-byte chunks are handled by the MMX loop, the remaining
 * 0..15 bytes by the scalar loop.
 *
 * The MMX loop tests its condition at the end, so it must only be entered
 * when at least one full chunk exists. Without the guard it would read and
 * write 16 bytes even for w < 16, and for w < 15 the unsigned compare
 * against a negative limit would never terminate it. The asm stores through
 * dst, hence the "memory" clobber. The index is kept at pointer width
 * (long) so the addressing is valid on i386 and on LP64 x86-64.
 * No emms is issued here.
 */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    if(w >= 16)
        __asm__ volatile(
            "1:                             \n\t"
            "movq  (%2, %0), %%mm0          \n\t"
            "movq  (%1, %0), %%mm1          \n\t"
            "psubb %%mm0, %%mm1             \n\t"
            "movq %%mm1, (%3, %0)           \n\t"
            "movq 8(%2, %0), %%mm0          \n\t"
            "movq 8(%1, %0), %%mm1          \n\t"
            "psubb %%mm0, %%mm1             \n\t"
            "movq %%mm1, 8(%3, %0)          \n\t"
            "add $16, %0                    \n\t"
            "cmp %4, %0                     \n\t" /* continue while i+16 <= w */
            " jb 1b                         \n\t"
            : "+r" (i)
            : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
            : "memory"
        );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
771
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
772 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
773 int i=0;
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
774 uint8_t l, lt;
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
775
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
776 asm volatile(
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
777 "1: \n\t"
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
778 "movq -1(%1, %0), %%mm0 \n\t" // LT
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
779 "movq (%1, %0), %%mm1 \n\t" // T
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
780 "movq -1(%2, %0), %%mm2 \n\t" // L
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
781 "movq (%2, %0), %%mm3 \n\t" // X
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
782 "movq %%mm2, %%mm4 \n\t" // L
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
783 "psubb %%mm0, %%mm2 \n\t"
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
784 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
785 "movq %%mm4, %%mm5 \n\t" // L
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
786 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
787 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
788 "pminub %%mm2, %%mm4 \n\t"
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
789 "pmaxub %%mm1, %%mm4 \n\t"
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
790 "psubb %%mm4, %%mm3 \n\t" // dst - pred
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
791 "movq %%mm3, (%3, %0) \n\t"
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
792 "addl $8, %0 \n\t"
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
793 "cmpl %4, %0 \n\t"
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
794 " jb 1b \n\t"
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
795 : "+r" (i)
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
796 : "r"(src1), "r"(src2), "r"(dst), "r"(w)
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
797 );
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
798
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
799 l= *left;
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
800 lt= *left_top;
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
801
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
802 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
803
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
804 *left_top= src1[w-1];
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
805 *left = src2[w-1];
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
806 }
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
807
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
/*
 * Two interleaved in-place 16-bit butterflies:
 *   (x1, y1) -> (x1 + y1, y1 - x1)   and   (x2, y2) -> (x2 + y2, y2 - x2).
 * The difference is formed as 2*y - (x + y), so no scratch register is used.
 */
#define LBUTTERFLY2(x1,y1,x2,y2)\
    "paddw " #y1 ", " #x1 " \n\t" /* x1 = x1 + y1 */\
    "paddw " #y2 ", " #x2 " \n\t" /* x2 = x2 + y2 */\
    "paddw " #y1 ", " #y1 " \n\t" /* y1 = 2*y1 */\
    "paddw " #y2 ", " #y2 " \n\t" /* y2 = 2*y2 */\
    "psubw " #x1 ", " #y1 " \n\t" /* y1 = old y1 - old x1 */\
    "psubw " #x2 ", " #y2 " \n\t" /* y2 = old y2 - old x2 */
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
815
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/*
 * Three butterfly stages over %mm0..%mm7 (register distance 1, then 2, then
 * 4), i.e. an unnormalised 8-point Hadamard transform applied independently
 * to each of the four 16-bit lanes.
 *
 * The final line deliberately carries no line continuation: a trailing
 * backslash would splice whatever line follows into this macro.
 */
#define HADAMARD48\
    LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
    LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
    LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
    LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
    LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
    LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
823
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/*
 * Per-word absolute value of `reg` using plain MMX; `mask` is clobbered.
 * mask becomes 0xFFFF in every lane where reg is negative, and
 * (reg ^ mask) - mask negates exactly those lanes.
 */
#define MMABS(reg,mask)\
    "pxor " #mask ", " #mask " \n\t" /* mask = 0 */\
    "pcmpgtw " #reg ", " #mask " \n\t" /* mask = (0 > reg) ? 0xFFFF : 0 */\
    "pxor " #mask ", " #reg " \n\t" /* invert the negative lanes */\
    "psubw " #mask ", " #reg " \n\t" /* +1 on those lanes -> |reg| */
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
829
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/*
 * Per-word |reg| (plain MMX sequence, `mask` clobbered) followed by an
 * unsigned-saturating add of the result into `acc`.
 */
#define MMABS_SUM(reg,mask, acc)\
    "pxor " #mask ", " #mask " \n\t" /* mask = 0 */\
    "pcmpgtw " #reg ", " #mask " \n\t" /* mask = (0 > reg) ? 0xFFFF : 0 */\
    "pxor " #mask ", " #reg " \n\t" /* invert the negative lanes */\
    "psubw " #mask ", " #reg " \n\t" /* +1 on those lanes -> |reg| */\
    "paddusw " #reg ", " #acc " \n\t" /* acc += |reg|, saturating */
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
836
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
/*
 * Per-word absolute value of `reg` using pmaxsw: |reg| = max(reg, -reg).
 * `neg` is clobbered.
 */
#define MMABS_MMX2(reg,neg)\
    "pxor " #neg ", " #neg " \n\t" /* neg = 0 */\
    "psubw " #reg ", " #neg " \n\t" /* neg = -reg */\
    "pmaxsw " #neg ", " #reg " \n\t" /* reg = max(reg, -reg) */
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
841
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
/*
 * Per-word |reg| via pmaxsw (`neg` clobbered) followed by an
 * unsigned-saturating add of the result into `acc`.
 */
#define MMABS_SUM_MMX2(reg,neg, acc)\
    "pxor " #neg ", " #neg " \n\t" /* neg = 0 */\
    "psubw " #reg ", " #neg " \n\t" /* neg = -reg */\
    "pmaxsw " #neg ", " #reg " \n\t" /* reg = max(reg, -reg) */\
    "paddusw " #reg ", " #acc " \n\t" /* acc += |reg|, saturating */
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
847
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/*
 * Interleave registers a and b at the granularity selected by n (the
 * punpckl/punpckh suffix, e.g. wd or dq): afterwards a holds the interleaved
 * low halves and t the interleaved high halves; b is left unchanged.
 *
 * The last line carries no line continuation: a trailing backslash would
 * splice whatever line follows into this macro.
 */
#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t " \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
852
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/*
 * Transpose the 4x4 matrix of 16-bit words held in r0..r3 (one row per
 * register), using tmp as scratch.  The transposed rows end up, in order,
 * in r0, r3, tmp, r2; r1 holds an intermediate value.
 */
#define TRANSPOSE4(r0,r1,r2,r3,tmp)\
    SBUTTERFLY(r0,r1,tmp,wd) /* r0=aebf tmp=cgdh */\
    SBUTTERFLY(r2,r3,r1,wd) /* r2=imjn r1=kolp */\
    SBUTTERFLY(r0,r2,r3,dq) /* r0=aeim r3=bfjn */\
    SBUTTERFLY(tmp,r1,r2,dq) /* tmp=cgko r2=dhlp */
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
858
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/*
 * Load four quadwords from asm operand %1 at byte offsets off, off+16,
 * off+32 and off+48 into r0..r3.
 */
#define LOAD4(off, r0, r1, r2, r3)\
    "movq "#off"(%1), " #r0 " \n\t"\
    "movq "#off"+16(%1), " #r1 " \n\t"\
    "movq "#off"+32(%1), " #r2 " \n\t"\
    "movq "#off"+48(%1), " #r3 " \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
864
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/*
 * Store registers a..d as four quadwords to asm operand %1 at byte offsets
 * o, o+16, o+32 and o+48.
 *
 * The last line carries no line continuation: a trailing backslash would
 * splice whatever line follows into this macro.
 */
#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1) \n\t"\
    "movq "#b", "#o"+16(%1) \n\t"\
    "movq "#c", "#o"+32(%1) \n\t"\
    "movq "#d", "#o"+48(%1) \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
870
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
871 static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
872 uint64_t temp[16] __align8;
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
873 int sum=0;
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
874
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
875 assert(h==8);
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
876
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
877 diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
878
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
879 asm volatile(
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
880 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
881 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
882
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
883 HADAMARD48
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
884
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
885 "movq %%mm7, 112(%1) \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
886
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
887 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
888 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
889
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
890 "movq 112(%1), %%mm7 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
891 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
892 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
893
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
894 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
895 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
896
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
897 HADAMARD48
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
898
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
899 "movq %%mm7, 120(%1) \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
900
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
901 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
902 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
903
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
904 "movq 120(%1), %%mm7 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
905 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
906 "movq %%mm7, %%mm5 \n\t"//FIXME remove
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
907 "movq %%mm6, %%mm7 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
908 "movq %%mm0, %%mm6 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
909 // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
910
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
911 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
912 // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
913
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
914 HADAMARD48
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
915 "movq %%mm7, 64(%1) \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
916 MMABS(%%mm0, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
917 MMABS_SUM(%%mm1, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
918 MMABS_SUM(%%mm2, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
919 MMABS_SUM(%%mm3, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
920 MMABS_SUM(%%mm4, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
921 MMABS_SUM(%%mm5, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
922 MMABS_SUM(%%mm6, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
923 "movq 64(%1), %%mm1 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
924 MMABS_SUM(%%mm1, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
925 "movq %%mm0, 64(%1) \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
926
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
927 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
928 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
929
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
930 HADAMARD48
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
931 "movq %%mm7, (%1) \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
932 MMABS(%%mm0, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
933 MMABS_SUM(%%mm1, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
934 MMABS_SUM(%%mm2, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
935 MMABS_SUM(%%mm3, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
936 MMABS_SUM(%%mm4, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
937 MMABS_SUM(%%mm5, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
938 MMABS_SUM(%%mm6, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
939 "movq (%1), %%mm1 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
940 MMABS_SUM(%%mm1, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
941 "movq 64(%1), %%mm1 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
942 MMABS_SUM(%%mm1, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
943
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
944 "movq %%mm0, %%mm1 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
945 "psrlq $32, %%mm0 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
946 "paddusw %%mm1, %%mm0 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
947 "movq %%mm0, %%mm1 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
948 "psrlq $16, %%mm0 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
949 "paddusw %%mm1, %%mm0 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
950 "movd %%mm0, %0 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
951
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
952 : "=r" (sum)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
953 : "r"(temp)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
954 );
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
955 return sum&0xFFFF;
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
956 }
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
957
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
958 static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
959 uint64_t temp[16] __align8;
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
960 int sum=0;
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
961
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
962 assert(h==8);
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
963
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
964 diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
965
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
966 asm volatile(
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
967 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
968 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
969
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
970 HADAMARD48
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
971
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
972 "movq %%mm7, 112(%1) \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
973
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
974 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
975 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
976
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
977 "movq 112(%1), %%mm7 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
978 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
979 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
980
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
981 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
982 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
983
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
984 HADAMARD48
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
985
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
986 "movq %%mm7, 120(%1) \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
987
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
988 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
989 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
990
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
991 "movq 120(%1), %%mm7 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
992 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
993 "movq %%mm7, %%mm5 \n\t"//FIXME remove
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
994 "movq %%mm6, %%mm7 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
995 "movq %%mm0, %%mm6 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
996 // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
997
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
998 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
999 // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1000
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1001 HADAMARD48
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1002 "movq %%mm7, 64(%1) \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1003 MMABS_MMX2(%%mm0, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1004 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1005 MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1006 MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1007 MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1008 MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1009 MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1010 "movq 64(%1), %%mm1 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1011 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1012 "movq %%mm0, 64(%1) \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1013
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1014 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1015 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1016
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1017 HADAMARD48
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1018 "movq %%mm7, (%1) \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1019 MMABS_MMX2(%%mm0, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1020 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1021 MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1022 MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1023 MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1024 MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1025 MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1026 "movq (%1), %%mm1 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1027 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1028 "movq 64(%1), %%mm1 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1029 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1030
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1031 "movq %%mm0, %%mm1 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1032 "psrlq $32, %%mm0 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1033 "paddusw %%mm1, %%mm0 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1034 "movq %%mm0, %%mm1 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1035 "psrlq $16, %%mm0 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1036 "paddusw %%mm1, %%mm0 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1037 "movd %%mm0, %0 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1038
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1039 : "=r" (sum)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1040 : "r"(temp)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1041 );
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1042 return sum&0xFFFF;
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1043 }
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1044
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1045
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
/* Instantiate hadamard8_diff16_mmx / hadamard8_diff16_mmx2 from the 8x8
 * functions above.  WARPER8_16_SQ is defined outside this chunk; presumably
 * it emits a 16x16 wrapper (second argument) that sums the 8x8 function
 * (first argument) over the sub-blocks -- TODO confirm against its
 * definition. */
1046 WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
1047 WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
1048 #endif //CONFIG_ENCODERS
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
1049
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* put_no_rnd_pixels8_mmx is a plain alias of put_pixels8_mmx: all four
 * arguments are forwarded unchanged (presumably because a straight copy
 * involves no rounding -- confirm against put_pixels8_mmx). */
#define put_no_rnd_pixels8_mmx(arg0,arg1,arg2,arg3) put_pixels8_mmx(arg0,arg1,arg2,arg3)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* put_no_rnd_pixels16_mmx is a plain alias of put_pixels16_mmx: all four
 * arguments are forwarded unchanged (presumably because a straight copy
 * involves no rounding -- confirm against put_pixels16_mmx). */
#define put_no_rnd_pixels16_mmx(arg0,arg1,arg2,arg3) put_pixels16_mmx(arg0,arg1,arg2,arg3)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1052
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/*
 * One output step of the 8-tap filter
 *     (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5
 * on 16-bit words, packed to unsigned bytes with saturation and handed to
 * OP(%%mm5, out, %%mm7, d).
 *
 *   x1 = m3 + m4      x2 = in2 + m5      x3 = in1 + m6      x4 = in0 + in7
 *
 * Side effects visible in the expansion: m3 is overwritten (first with x1,
 * then with in7), and %%mm4, %%mm5, %%mm6 are always used as scratch
 * regardless of the register arguments.  The pw_20 and pw_3 parameters are
 * not referenced; the constants are taken from MANGLE(ff_pw_20) and
 * MANGLE(ff_pw_3) directly.
 */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 " \n\t" /* m3 = x1 */\
    "movq " MANGLE(ff_pw_20) ", %%mm4 \n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4 \n\t" /* 20*x1 */\
    "movq " #in7 ", " #m3 " \n\t" /* m3 = in7 */\
    "movq " #in0 ", %%mm5 \n\t" /* in0 */\
    "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
    "psubw %%mm5, %%mm4 \n\t" /* 20*x1 - x4 */\
    "movq " #in1 ", %%mm5 \n\t" /* in1 */\
    "movq " #in2 ", %%mm6 \n\t" /* in2 */\
    "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
    "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
    "paddw %%mm6, %%mm6 \n\t" /* 2*x2 */\
    "psubw %%mm6, %%mm5 \n\t" /* x3 - 2*x2 */\
    "pmullw " MANGLE(ff_pw_3) ", %%mm5 \n\t" /* 3*x3 - 6*x2 */\
    "paddw " #rnd ", %%mm4 \n\t" /* + rounder */\
    "paddw %%mm4, %%mm5 \n\t" /* 20*x1 - 6*x2 + 3*x3 - x4 + rnd */\
    "psraw $5, %%mm5 \n\t" /* arithmetic >> 5 */\
    "packuswb %%mm5, %%mm5 \n\t" /* saturate to unsigned bytes */\
    OP(%%mm5, out, %%mm7, d)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1073
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1074 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
1075 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1076 uint64_t temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1077 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1078 asm volatile(\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1079 "pxor %%mm7, %%mm7 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1080 "1: \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1081 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1082 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1083 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1084 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1085 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1086 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1087 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1088 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1089 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1090 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1091 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1092 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1093 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1094 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1095 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1096 "paddw %%mm3, %%mm5 \n\t" /* b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1097 "paddw %%mm2, %%mm6 \n\t" /* c */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1098 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1099 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1100 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1101 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1102 "paddw %%mm4, %%mm0 \n\t" /* a */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1103 "paddw %%mm1, %%mm5 \n\t" /* d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1104 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1105 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1106 "paddw %6, %%mm6 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1107 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1108 "psraw $5, %%mm0 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1109 "movq %%mm0, %5 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1110 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1111 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1112 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1113 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1114 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1115 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1116 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1117 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1118 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1119 "paddw %%mm0, %%mm2 \n\t" /* b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1120 "paddw %%mm5, %%mm3 \n\t" /* c */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1121 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1122 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1123 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1124 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1125 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1126 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1127 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1128 "paddw %%mm2, %%mm1 \n\t" /* a */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1129 "paddw %%mm6, %%mm4 \n\t" /* d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1130 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1131 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1132 "paddw %6, %%mm1 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1133 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1134 "psraw $5, %%mm3 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1135 "movq %5, %%mm1 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1136 "packuswb %%mm3, %%mm1 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1137 OP_MMX2(%%mm1, (%1),%%mm4, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1138 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1139 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1140 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1141 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1142 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1143 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1144 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1145 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1146 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1147 "paddw %%mm1, %%mm5 \n\t" /* b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1148 "paddw %%mm4, %%mm0 \n\t" /* c */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1149 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1150 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1151 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1152 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1153 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1154 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1155 "paddw %%mm3, %%mm2 \n\t" /* d */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1156 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1157 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1158 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1159 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1160 "paddw %%mm2, %%mm6 \n\t" /* a */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1161 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1162 "paddw %6, %%mm0 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1163 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1164 "psraw $5, %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1165 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1166 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1167 "paddw %%mm5, %%mm3 \n\t" /* a */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1168 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1169 "paddw %%mm4, %%mm6 \n\t" /* b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1170 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1171 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1172 "paddw %%mm1, %%mm4 \n\t" /* c */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1173 "paddw %%mm2, %%mm5 \n\t" /* d */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1174 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1175 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1176 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1177 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1178 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1179 "paddw %6, %%mm4 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1180 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1181 "psraw $5, %%mm4 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1182 "packuswb %%mm4, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1183 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1184 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1185 "addl %3, %0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1186 "addl %4, %1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1187 "decl %2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1188 " jnz 1b \n\t"\
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
1189 : "+a"(src), "+c"(dst), "+m"(h)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1190 : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1191 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1192 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1193 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1194 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1195 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ /* 16-pixel-wide horizontal lowpass over h rows: reads src[0..16] per row and emits 16 result bytes per row at dst through OP_3DNOW */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1196 int i; /* row index */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1197 int16_t temp[16]; /* unrounded 16-bit filter sums for the current row */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1198 /* quick HACK, XXX FIXME MUST be optimized: the filter taps are evaluated in scalar C below; only the rounding add, the >>5, the saturating pack and the store run in MMX */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1199 for(i=0; i<h; i++)\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1200 {\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1201 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]); /* left edge: taps that would fall left of src[0] are mirrored (-1 -> 0, -2 -> 1, -3 -> 2) */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1202 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1203 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1204 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]); /* first output with no mirroring: 20*(x[k]+x[k+1]) - 6*(x[k-1]+x[k+2]) + 3*(x[k-2]+x[k+3]) - (x[k-3]+x[k+4]) */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1205 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1206 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1207 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1208 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1209 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1210 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1211 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1212 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1213 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1214 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]); /* right edge: taps that would fall right of src[16] are mirrored (17 -> 16, 18 -> 15, 19 -> 14) */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1215 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1216 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1217 asm volatile(\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1218 "movq (%0), %%mm0 \n\t" /* temp[0..3] */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1219 "movq 8(%0), %%mm1 \n\t" /* temp[4..7] */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1220 "paddw %2, %%mm0 \n\t" /* + ROUNDER */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1221 "paddw %2, %%mm1 \n\t" /* + ROUNDER */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1222 "psraw $5, %%mm0 \n\t" /* arithmetic >> 5 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1223 "psraw $5, %%mm1 \n\t" /* arithmetic >> 5 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1224 "packuswb %%mm1, %%mm0 \n\t" /* 8 words -> 8 bytes with unsigned saturation */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1225 OP_3DNOW(%%mm0, (%1), %%mm1, q) /* left 8 results go to dst[0..7]; NOTE(review): OP_3DNOW is not defined in this view, presumably a parameter of the enclosing macro -- confirm what it does with (%1) */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1226 "movq 16(%0), %%mm0 \n\t" /* temp[8..11] */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1227 "movq 24(%0), %%mm1 \n\t" /* temp[12..15] */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1228 "paddw %2, %%mm0 \n\t" /* + ROUNDER */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1229 "paddw %2, %%mm1 \n\t" /* + ROUNDER */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1230 "psraw $5, %%mm0 \n\t" /* arithmetic >> 5 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1231 "psraw $5, %%mm1 \n\t" /* arithmetic >> 5 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1232 "packuswb %%mm1, %%mm0 \n\t" /* 8 words -> 8 bytes with unsigned saturation */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1233 OP_3DNOW(%%mm0, 8(%1), %%mm1, q) /* right 8 results go to dst[8..15] */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1234 :: "r"(temp), "r"(dst), "m"(ROUNDER) /* %0 = temp, %1 = dst, %2 = ROUNDER as a memory operand; no outputs */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1235 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1236 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1237 dst+=dstStride; /* advance to the next output row */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1238 src+=srcStride; /* advance to the next input row */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1239 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1240 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1241 \
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
1242 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ /* 8-pixel-wide horizontal lowpass over h rows, done entirely in one asm loop: reads src[0..8] per row and emits 8 result bytes per row at dst through OP_MMX2 */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1243 uint64_t temp; /* passed as operand %5 below, but the asm body never references %5 */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1244 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1245 asm volatile(\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1246 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used by the unpacks to zero-extend bytes to words */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1247 "1: \n\t" /* top of the per-row loop */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1248 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1249 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1250 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1251 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1252 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1253 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1254 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1255 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1256 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1257 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1258 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1259 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1260 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1261 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1262 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1263 "paddw %%mm3, %%mm5 \n\t" /* b */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1264 "paddw %%mm2, %%mm6 \n\t" /* c */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1265 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1266 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1267 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1268 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1269 "paddw %%mm4, %%mm0 \n\t" /* a */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1270 "paddw %%mm1, %%mm5 \n\t" /* d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1271 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1272 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1273 "paddw %6, %%mm6 \n\t" /* + ROUNDER (operand %6) */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1274 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1275 "psraw $5, %%mm0 \n\t" /* mm0 = results for output pixels 0..3, still as words */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1276 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1277 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1278 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1279 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1280 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1281 "paddw %%mm5, %%mm1 \n\t" /* a */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1282 "paddw %%mm6, %%mm2 \n\t" /* b */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1283 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1284 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1285 "paddw %%mm6, %%mm3 \n\t" /* c */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1286 "paddw %%mm5, %%mm4 \n\t" /* d */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1287 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1288 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1289 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1290 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1291 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1292 "paddw %6, %%mm1 \n\t" /* + ROUNDER (operand %6) */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1293 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1294 "psraw $5, %%mm3 \n\t" /* mm3 = results for output pixels 4..7, still as words */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1295 "packuswb %%mm3, %%mm0 \n\t" /* pack pixels 0..7 to bytes with unsigned saturation */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1296 OP_MMX2(%%mm0, (%1), %%mm4, q) /* hands the packed row and the destination (%1) to OP_MMX2; NOTE(review): OP_MMX2 is not defined in this view, presumably a parameter of the enclosing macro -- confirm */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1297 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1298 "addl %3, %0 \n\t" /* src += srcStride */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1299 "addl %4, %1 \n\t" /* dst += dstStride */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1300 "decl %2 \n\t" /* h-- (h is a memory operand) */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1301 " jnz 1b \n\t" /* next row while h != 0 */\
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
1302 : "+a"(src), "+c"(dst), "+m"(h) /* %0 = src in eax, %1 = dst in ecx, %2 = h in memory; all read-write */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1303 : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER) /* %3 = srcStride in esi, %4 = dstStride in edi, %5 = temp (unused by the body), %6 = ROUNDER */\
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1304 : "memory"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1305 );\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1306 }\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1307 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1308 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ /* 8-pixel-wide horizontal lowpass over h rows: reads src[0..8] per row and emits 8 result bytes per row at dst through OP_3DNOW */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1309 int i; /* row index */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1310 int16_t temp[8]; /* unrounded 16-bit filter sums for the current row */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1311 /* quick HACK, XXX FIXME MUST be optimized: the filter taps are evaluated in scalar C below; only the rounding add, the >>5, the saturating pack and the store run in MMX */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1312 for(i=0; i<h; i++)\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1313 {\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1314 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]); /* left edge: taps that would fall left of src[0] are mirrored (-1 -> 0, -2 -> 1, -3 -> 2) */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1315 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1316 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1317 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]); /* no mirroring here or in temp[4]: 20*(x[k]+x[k+1]) - 6*(x[k-1]+x[k+2]) + 3*(x[k-2]+x[k+3]) - (x[k-3]+x[k+4]) */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1318 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1319 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]); /* right edge: taps that would fall right of src[8] are mirrored (9 -> 8, 10 -> 7, 11 -> 6) */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1320 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1321 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1322 asm volatile(\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1323 "movq (%0), %%mm0 \n\t" /* temp[0..3] */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1324 "movq 8(%0), %%mm1 \n\t" /* temp[4..7] */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1325 "paddw %2, %%mm0 \n\t" /* + ROUNDER */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1326 "paddw %2, %%mm1 \n\t" /* + ROUNDER */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1327 "psraw $5, %%mm0 \n\t" /* arithmetic >> 5 */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1328 "psraw $5, %%mm1 \n\t" /* arithmetic >> 5 */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1329 "packuswb %%mm1, %%mm0 \n\t" /* 8 words -> 8 bytes with unsigned saturation */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1330 OP_3DNOW(%%mm0, (%1), %%mm1, q) /* the 8 results go to dst[0..7]; NOTE(review): OP_3DNOW is not defined in this view, presumably a parameter of the enclosing macro -- confirm what it does with (%1) */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1331 :: "r"(temp), "r"(dst), "m"(ROUNDER) /* %0 = temp, %1 = dst, %2 = ROUNDER as a memory operand; no outputs */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1332 :"memory"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1333 );\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1334 dst+=dstStride; /* advance to the next output row */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1335 src+=srcStride; /* advance to the next input row */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1336 }\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1337 }
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1338
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1339 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1340 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1341 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1342 uint64_t temp[17*4];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1343 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1344 int count= 17;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1345 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1346 /*FIXME unroll */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1347 asm volatile(\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1348 "pxor %%mm7, %%mm7 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1349 "1: \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1350 "movq (%0), %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1351 "movq (%0), %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1352 "movq 8(%0), %%mm2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1353 "movq 8(%0), %%mm3 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1354 "punpcklbw %%mm7, %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1355 "punpckhbw %%mm7, %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1356 "punpcklbw %%mm7, %%mm2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1357 "punpckhbw %%mm7, %%mm3 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1358 "movq %%mm0, (%1) \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1359 "movq %%mm1, 17*8(%1) \n\t"\
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
1360 "movq %%mm2, 2*17*8(%1) \n\t"\
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
1361 "movq %%mm3, 3*17*8(%1) \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1362 "addl $8, %1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1363 "addl %3, %0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1364 "decl %2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1365 " jnz 1b \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1366 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
1367 : "r" (srcStride)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1368 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1369 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1370 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1371 temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1372 count=4;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1373 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1374 /*FIXME reorder for speed */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1375 asm volatile(\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1376 /*"pxor %%mm7, %%mm7 \n\t"*/\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1377 "1: \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1378 "movq (%0), %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1379 "movq 8(%0), %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1380 "movq 16(%0), %%mm2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1381 "movq 24(%0), %%mm3 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1382 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1383 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1384 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1385 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1386 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1387 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1388 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1389 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1390 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1391 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1392 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1393 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1394 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1395 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1396 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1397 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1398 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1399 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1400 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1401 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1402 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1403 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1404 "addl %4, %1 \n\t" \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1405 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1406 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1407 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1408 "addl $136, %0 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1409 "addl %6, %1 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1410 "decl %2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1411 " jnz 1b \n\t"\
958
9bb668034ecf slowdown / gcc 2.95.* bug workaround (this should be reversed as soon as gcc 2.95.* support is droped)
michaelni
parents: 954
diff changeset
1412 \
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
1413 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1414 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1415 :"memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1416 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1417 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1418 \
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
1419 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1420 uint64_t temp[9*4];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1421 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1422 int count= 9;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1423 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1424 /*FIXME unroll */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1425 asm volatile(\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1426 "pxor %%mm7, %%mm7 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1427 "1: \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1428 "movq (%0), %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1429 "movq (%0), %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1430 "punpcklbw %%mm7, %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1431 "punpckhbw %%mm7, %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1432 "movq %%mm0, (%1) \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1433 "movq %%mm1, 9*8(%1) \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1434 "addl $8, %1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1435 "addl %3, %0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1436 "decl %2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1437 " jnz 1b \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1438 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1439 : "r" (srcStride)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1440 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1441 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1442 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1443 temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1444 count=2;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1445 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1446 /*FIXME reorder for speed */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1447 asm volatile(\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1448 /*"pxor %%mm7, %%mm7 \n\t"*/\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1449 "1: \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1450 "movq (%0), %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1451 "movq 8(%0), %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1452 "movq 16(%0), %%mm2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1453 "movq 24(%0), %%mm3 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1454 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1455 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1456 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1457 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1458 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1459 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1460 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1461 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1462 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1463 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1464 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1465 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1466 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1467 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1468 "addl $72, %0 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1469 "addl %6, %1 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1470 "decl %2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1471 " jnz 1b \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1472 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1473 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1474 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1475 : "memory"\
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1476 );\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1477 }\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1478 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1479 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1480 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1481 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1482 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1483 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1484 uint64_t temp[8];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1485 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1486 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1487 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1488 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1489 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1490 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1491 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1492 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1493 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1494 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1495 uint64_t temp[8];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1496 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1497 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1498 OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1499 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1500 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1501 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1502 uint64_t temp[8];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1503 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1504 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1505 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1506 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1507 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1508 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1509 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1510 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1511 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1512 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1513 uint64_t temp[8];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1514 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1515 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1516 OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1517 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1518 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1519 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1520 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1521 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1522 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1523 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1524 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1525 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1526 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1527 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1528 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1529 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1530 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1531 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1532 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1533 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1534 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1535 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1536 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1537 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1538 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1539 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1540 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1541 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1542 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1543 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1544 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1545 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1546 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1547 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1548 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1549 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1550 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1551 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1552 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1553 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1554 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1555 uint64_t half[8 + 9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1556 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1557 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1558 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1559 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1560 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1561 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1562 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1563 uint64_t half[8 + 9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1564 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1565 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1566 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1567 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1568 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1569 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1570 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1571 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1572 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1573 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1574 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1575 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1576 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1577 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1578 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1579 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1580 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1581 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1582 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1583 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1584 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1585 uint64_t half[9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1586 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1587 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1588 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1589 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1590 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1591 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1592 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1593 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1594 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1595 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1596 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1597 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1598 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1599 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1600 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1601 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1602 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1603 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1604 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1605 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1606 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1607 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1608 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1609 OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1610 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1611 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1612 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1613 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1614 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1615 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1616 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1617 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1618 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1619 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1620 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1621 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1622 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1623 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1624 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1625 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1626 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1627 OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1628 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1629 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1630 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1631 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1632 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1633 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1634 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1635 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1636 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1637 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1638 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1639 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1640 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1641 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1642 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1643 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1644 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1645 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1646 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1647 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1648 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1649 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1650 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1651 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1652 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1653 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1654 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1655 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1656 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1657 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1658 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1659 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1660 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1661 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1662 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1663 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1664 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1665 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1666 uint64_t half[16*2 + 17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1667 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1668 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1669 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1670 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1671 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1672 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1673 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1674 uint64_t half[16*2 + 17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1675 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1676 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1677 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1678 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1679 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1680 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1681 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1682 uint64_t half[17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1683 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1684 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1685 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1686 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1687 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1688 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1689 uint64_t half[17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1690 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1691 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1692 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1693 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1694 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1695 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1696 uint64_t half[17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1697 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1698 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1699 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1700 }
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1701
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1702
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/* Store step for the "put" variants: expands to the text of a single
 * mov<size> instruction with operands a, b.  The temp argument is accepted
 * but not used, so the macro has the same parameter list as the AVG_*_OP
 * macros. */
1703 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Store step for the "avg" variants, 3DNow! flavour.  Expands to the text of
 * three instructions: mov<size> b, temp; pavgusb temp, a; mov<size> a, b.
 * Read in source,destination operand order this loads what is currently at
 * b into temp, averages it into a, and writes a back to b.
 * Note: a and temp are both modified by the emitted sequence. */
1704 #define AVG_3DNOW_OP(a,b,temp, size) \
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1705 "mov" #size " " #b ", " #temp " \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1706 "pavgusb " #temp ", " #a " \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1707 "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Store step for the "avg" variants, MMX2 flavour.  Same three-instruction
 * sequence as AVG_3DNOW_OP, except that the averaging instruction is pavgb
 * instead of pavgusb: mov<size> b, temp; pavgb temp, a; mov<size> a, b.
 * Note: a and temp are both modified by the emitted sequence. */
1708 #define AVG_MMX2_OP(a,b,temp, size) \
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1709 "mov" #size " " #b ", " #temp " \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1710 "pavgb " #temp ", " #a " \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1711 "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1712
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Instantiate the qpel function families.
 * QPEL_BASE is expanded once per operation (put_, avg_, put_no_rnd_) and is
 * given both an MMX2 and a 3DNow! store macro.  QPEL_OP is then expanded once
 * per operation for each of the two suffixes, 3dnow and mmx2, with the
 * matching store macro.
 * The put_no_rnd_ expansions pass ff_pw_15 where the others pass ff_pw_16;
 * presumably this is the rounding constant -- confirm against the macro
 * definitions, which are not visible here. */
1713 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1714 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1715 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1716 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1717 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1718 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1719 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1720 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1721 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1722
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
/* Disabled code: the #if 0 guard keeps this empty no-op function out of the
 * build. */
1723 #if 0
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
1724 static void just_return() { return; }
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
1725 #endif
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
1726
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/* Assigns three function pointers in one go: the functions put_<postfix2>,
 * put_no_rnd_<postfix2> and avg_<postfix2> are stored into c->put_<postfix1>,
 * c->put_no_rnd_<postfix1> and c->avg_<postfix1> respectively.
 * The expansion refers to a variable named c, which must be in scope at the
 * point of use.  It expands to three separate statements, so it is not safe
 * as the unbraced body of an if/else. */
1727 #define SET_QPEL_FUNC(postfix1, postfix2) \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1728 c->put_ ## postfix1 = put_ ## postfix2;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1729 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1730 c->avg_ ## postfix1 = avg_ ## postfix2;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1731
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1732 /* external functions, from idct_mmx.c */
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1733 void ff_mmx_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1734 void ff_mmxext_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1735
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1736 /* XXX: these functions should be removed as soon as all IDCTs are
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1737 converted */
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
/* IDCT-and-store wrapper: runs ff_mmx_idct on block, then hands the same
 * block to put_pixels_clamped_mmx together with dest and line_size.
 * The parameter order here (dest, line_size, block) differs from the order
 * put_pixels_clamped_mmx is called with (block, dest, line_size). */
1738 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1739 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1740 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1741 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1742 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
/* IDCT-and-add wrapper: runs ff_mmx_idct on block, then hands the same block
 * to add_pixels_clamped_mmx together with dest and line_size. */
1743 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1744 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1745 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1746 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1747 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
/* IDCT-and-store wrapper, MMXEXT IDCT: runs ff_mmxext_idct on block, then
 * hands the same block to put_pixels_clamped_mmx together with dest and
 * line_size.  Only the IDCT differs from ff_libmpeg2mmx_idct_put; the store
 * routine is the same plain-MMX one. */
1748 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1749 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1750 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1751 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1752 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
/* IDCT-and-add wrapper, MMXEXT IDCT: runs ff_mmxext_idct on block, then hands
 * the same block to add_pixels_clamped_mmx together with dest and line_size.
 * Only the IDCT differs from ff_libmpeg2mmx_idct_add. */
1753 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1754 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1755 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1756 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1757 }
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1758
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1759 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1760 {
986e461dc072 Initial revision
glantau
parents:
diff changeset
1761 mm_flags = mm_support();
1115
74a46d77e061 * support FF_MM_FORCE
kabi
parents: 1092
diff changeset
1762
1122
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
1763 if (avctx->dsp_mask) {
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
1764 if (avctx->dsp_mask & FF_MM_FORCE)
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
1765 mm_flags |= (avctx->dsp_mask & 0xffff);
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
1766 else
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
1767 mm_flags &= ~(avctx->dsp_mask & 0xffff);
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
1768 }
1115
74a46d77e061 * support FF_MM_FORCE
kabi
parents: 1092
diff changeset
1769
631
47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents: 629
diff changeset
1770 #if 0
47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents: 629
diff changeset
1771 fprintf(stderr, "libavcodec: CPU flags:");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1772 if (mm_flags & MM_MMX)
631
47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents: 629
diff changeset
1773 fprintf(stderr, " mmx");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1774 if (mm_flags & MM_MMXEXT)
631
47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents: 629
diff changeset
1775 fprintf(stderr, " mmxext");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1776 if (mm_flags & MM_3DNOW)
631
47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents: 629
diff changeset
1777 fprintf(stderr, " 3dnow");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1778 if (mm_flags & MM_SSE)
631
47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents: 629
diff changeset
1779 fprintf(stderr, " sse");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1780 if (mm_flags & MM_SSE2)
631
47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents: 629
diff changeset
1781 fprintf(stderr, " sse2");
47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents: 629
diff changeset
1782 fprintf(stderr, "\n");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1783 #endif
986e461dc072 Initial revision
glantau
parents:
diff changeset
1784
986e461dc072 Initial revision
glantau
parents:
diff changeset
1785 if (mm_flags & MM_MMX) {
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1786 const int dct_algo = avctx->dct_algo;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1787 const int idct_algo= avctx->idct_algo;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1788
1232
e88d3b1fb2a1 more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>)
michaelni
parents: 1186
diff changeset
1789 #ifdef CONFIG_ENCODERS
1565
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
1790 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
1791 if(mm_flags & MM_MMXEXT){
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
1792 c->fdct = ff_fdct_mmx2;
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
1793 }else{
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
1794 c->fdct = ff_fdct_mmx;
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
1795 }
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
1796 }
1232
e88d3b1fb2a1 more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>)
michaelni
parents: 1186
diff changeset
1797 #endif //CONFIG_ENCODERS
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1798
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1799 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1800 c->idct_put= ff_simple_idct_put_mmx;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1801 c->idct_add= ff_simple_idct_add_mmx;
1324
7d328fd9d8a5 the return of the idct with 16bit output by ("Ivan Kalvachev" <ivan at cacad dot com>)
michaelni
parents: 1232
diff changeset
1802 c->idct = ff_simple_idct_mmx;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1803 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1804 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1805 if(mm_flags & MM_MMXEXT){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1806 c->idct_put= ff_libmpeg2mmx2_idct_put;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1807 c->idct_add= ff_libmpeg2mmx2_idct_add;
1324
7d328fd9d8a5 the return of the idct with 16bit output by ("Ivan Kalvachev" <ivan at cacad dot com>)
michaelni
parents: 1232
diff changeset
1808 c->idct = ff_mmxext_idct;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1809 }else{
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1810 c->idct_put= ff_libmpeg2mmx_idct_put;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1811 c->idct_add= ff_libmpeg2mmx_idct_add;
1324
7d328fd9d8a5 the return of the idct with 16bit output by ("Ivan Kalvachev" <ivan at cacad dot com>)
michaelni
parents: 1232
diff changeset
1812 c->idct = ff_mmx_idct;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1813 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1814 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1815 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1816
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
1817 #ifdef CONFIG_ENCODERS
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1818 c->get_pixels = get_pixels_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1819 c->diff_pixels = diff_pixels_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
1820 #endif //CONFIG_ENCODERS
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1821 c->put_pixels_clamped = put_pixels_clamped_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1822 c->add_pixels_clamped = add_pixels_clamped_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1823 c->clear_blocks = clear_blocks_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
1824 #ifdef CONFIG_ENCODERS
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1825 c->pix_sum = pix_sum16_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
1826 #endif //CONFIG_ENCODERS
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
1827
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1828 c->put_pixels_tab[0][0] = put_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1829 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1830 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1831 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1832
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1833 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1834 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1835 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1836 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
1837
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1838 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1839 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1840 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1841 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
1842
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1843 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1844 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1845 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1846 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1847
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1848 c->put_pixels_tab[1][0] = put_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1849 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1850 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1851 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1852
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1853 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1854 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1855 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1856 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
1857
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1858 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1859 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1860 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1861 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
1862
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1863 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1864 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1865 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1866 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1867
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
1868 c->add_bytes= add_bytes_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
1869 #ifdef CONFIG_ENCODERS
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
1870 c->diff_bytes= diff_bytes_mmx;
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1871
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1872 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1873 c->hadamard8_diff[1]= hadamard8_diff_mmx;
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1874
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
1875 c->pix_norm1 = pix_norm1_mmx;
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
1876 c->sse[0] = sse16_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
1877 #endif //CONFIG_ENCODERS
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
1878
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
1879 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
1880 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1881
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1882 if (mm_flags & MM_MMXEXT) {
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1883 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1884 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
1885
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1886 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1887 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1888 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
1889
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1890 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1891 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
1892
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1893 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1894 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1895 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1896
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
1897 #ifdef CONFIG_ENCODERS
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1898 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1899 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
1900 #endif //CONFIG_ENCODERS
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1901
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1902 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1903 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1904 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1905 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1906 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1907 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1908 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1909 }
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1910
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1911 #if 1
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1912 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1913 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1914 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1915 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1916 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1917 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1918 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1919 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1920 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1921 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1922 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1923 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1924 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1925 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1926 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1927 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1928 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1929 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1930 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1931 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1932 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1933 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1934 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1935 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1936 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1937 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1938 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1939 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1940 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1941 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1942 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1943 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1944 #endif
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1945
1686
68abbec33289 Here are just two added #ifdef CONFIG_ENCODERS to allow
michael
parents: 1648
diff changeset
1946 #ifdef CONFIG_ENCODERS
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1947 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
1686
68abbec33289 Here are just two added #ifdef CONFIG_ENCODERS to allow
michael
parents: 1648
diff changeset
1948 #endif //CONFIG_ENCODERS
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1949 } else if (mm_flags & MM_3DNOW) {
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1950 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1951 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
1952
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1953 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1954 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1955 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
1956
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1957 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1958 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1959
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1960 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1961 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
1962 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1963
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1964 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1965 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1966 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1967 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1968 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1969 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1970 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1971 }
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1972
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1973 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1974 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1975 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1976 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1977 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1978 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1979 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1980 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1981 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1982 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1983 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1984 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1985 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1986 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1987 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1988 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1989 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1990 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1991 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1992 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1993 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1994 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1995 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1996 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1997 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1998 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1999 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2000 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2001 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2002 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2003 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2004 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2005 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
2006 }
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2007
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
2008 #ifdef CONFIG_ENCODERS
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2009 dsputil_init_pix_mmx(c, avctx);
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
2010 #endif //CONFIG_ENCODERS
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2011 #if 0
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2012 // for speed testing
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2013 get_pixels = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2014 put_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2015 add_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2016
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2017 pix_abs16x16 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2018 pix_abs16x16_x2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2019 pix_abs16x16_y2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2020 pix_abs16x16_xy2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2021
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2022 put_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2023 put_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2024 put_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2025 put_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2026
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2027 put_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2028 put_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2029 put_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2030 put_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2031
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2032 avg_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2033 avg_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2034 avg_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2035 avg_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2036
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2037 avg_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2038 avg_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2039 avg_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2040 avg_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2041
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2042 //av_fdct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2043 //ff_idct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2044 #endif
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2045 }