annotate i386/dsputil_mmx.c @ 1985:b2bc62fdecc0 libavcodec

move the 0x80 vector outside of the function, thus saving the compiler the trouble of having to initialize each byte on the stack individually
author melanson
date Tue, 27 Apr 2004 04:06:24 +0000
parents ef919e9ef73e
children f65d87bfdd5a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1 /*
986e461dc072 Initial revision
glantau
parents:
diff changeset
2 * MMX optimized DSP utils
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
1739
07a484280a82 copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents: 1729
diff changeset
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
5 *
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
6 * This library is free software; you can redistribute it and/or
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
8 * License as published by the Free Software Foundation; either
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
9 * version 2 of the License, or (at your option) any later version.
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
10 *
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
11 * This library is distributed in the hope that it will be useful,
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
14 * Lesser General Public License for more details.
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
15 *
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
17 * License along with this library; if not, write to the Free Software
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
19 *
986e461dc072 Initial revision
glantau
parents:
diff changeset
20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
986e461dc072 Initial revision
glantau
parents:
diff changeset
21 */
986e461dc072 Initial revision
glantau
parents:
diff changeset
22
986e461dc072 Initial revision
glantau
parents:
diff changeset
23 #include "../dsputil.h"
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
24 #include "../simple_idct.h"
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
25 #include "mmx.h"
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
26
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
27 //#undef NDEBUG
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
28 //#include <assert.h>
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
29
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
30 extern const uint8_t ff_h263_loop_filter_strength[32];
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
31
5
4479bcab253e suppressed no longer needed emms()
glantau
parents: 0
diff changeset
32 int mm_flags; /* multimedia extension flags */
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
33
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
34 /* pixel operations */
1845
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
35 static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
36 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
37 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
38
1845
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
39 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
40 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
41 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
42 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
43
1845
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
44 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
45
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
46 #define JUMPALIGN() __asm __volatile (".balign 8"::)
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
47 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
48
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
49 #define MOVQ_WONE(regd) \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
50 __asm __volatile ( \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
51 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
52 "psrlw $15, %%" #regd ::)
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
53
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
54 #define MOVQ_BFE(regd) \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
55 __asm __volatile ( \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
56 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
57 "paddb %%" #regd ", %%" #regd " \n\t" ::)
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
58
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
59 #ifndef PIC
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
60 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
61 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
62 #else
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
63 // for shared library it's better to use this way for accessing constants
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
64 // pcmpeqd -> -1
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
65 #define MOVQ_BONE(regd) \
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
66 __asm __volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
67 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
68 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
69 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
70
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
71 #define MOVQ_WTWO(regd) \
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
72 __asm __volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
73 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
74 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
75 "psllw $1, %%" #regd " \n\t"::)
387
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
76
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
77 #endif
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
78
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
79 // using regr as temporary and for the output result
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
80 // first argument is unmodified and second is trashed
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
81 // regfe is supposed to contain 0xfefefefefefefefe
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
82 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
83 "movq " #rega ", " #regr " \n\t"\
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
84 "pand " #regb ", " #regr " \n\t"\
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
85 "pxor " #rega ", " #regb " \n\t"\
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
86 "pand " #regfe "," #regb " \n\t"\
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
87 "psrlq $1, " #regb " \n\t"\
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
88 "paddb " #regb ", " #regr " \n\t"
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
89
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
90 #define PAVGB_MMX(rega, regb, regr, regfe) \
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
91 "movq " #rega ", " #regr " \n\t"\
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
92 "por " #regb ", " #regr " \n\t"\
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
93 "pxor " #rega ", " #regb " \n\t"\
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
94 "pand " #regfe "," #regb " \n\t"\
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
95 "psrlq $1, " #regb " \n\t"\
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
96 "psubb " #regb ", " #regr " \n\t"
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
97
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
98 // mm6 is supposed to contain 0xfefefefefefefefe
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
99 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
100 "movq " #rega ", " #regr " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
101 "movq " #regc ", " #regp " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
102 "pand " #regb ", " #regr " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
103 "pand " #regd ", " #regp " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
104 "pxor " #rega ", " #regb " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
105 "pxor " #regc ", " #regd " \n\t"\
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
106 "pand %%mm6, " #regb " \n\t"\
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
107 "pand %%mm6, " #regd " \n\t"\
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
108 "psrlq $1, " #regb " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
109 "psrlq $1, " #regd " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
110 "paddb " #regb ", " #regr " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
111 "paddb " #regd ", " #regp " \n\t"
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
112
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
113 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
114 "movq " #rega ", " #regr " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
115 "movq " #regc ", " #regp " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
116 "por " #regb ", " #regr " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
117 "por " #regd ", " #regp " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
118 "pxor " #rega ", " #regb " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
119 "pxor " #regc ", " #regd " \n\t"\
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
120 "pand %%mm6, " #regb " \n\t"\
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
121 "pand %%mm6, " #regd " \n\t"\
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
122 "psrlq $1, " #regd " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
123 "psrlq $1, " #regb " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
124 "psubb " #regb ", " #regr " \n\t"\
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
125 "psubb " #regd ", " #regp " \n\t"
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
126
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
127 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
128 /* MMX no rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
129 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
130 #define SET_RND MOVQ_WONE
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
131 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
132 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
133
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
134 #include "dsputil_mmx_rnd.h"
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
135
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
136 #undef DEF
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
137 #undef SET_RND
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
138 #undef PAVGBP
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
139 #undef PAVGB
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
140 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
141 /* MMX rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
142
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
143 #define DEF(x, y) x ## _ ## y ##_mmx
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
144 #define SET_RND MOVQ_WTWO
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
145 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
146 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
147
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
148 #include "dsputil_mmx_rnd.h"
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
149
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
150 #undef DEF
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
151 #undef SET_RND
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
152 #undef PAVGBP
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
153 #undef PAVGB
387
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
154
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
155 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
156 /* 3Dnow specific */
986e461dc072 Initial revision
glantau
parents:
diff changeset
157
986e461dc072 Initial revision
glantau
parents:
diff changeset
158 #define DEF(x) x ## _3dnow
986e461dc072 Initial revision
glantau
parents:
diff changeset
159 /* for Athlons PAVGUSB is preferred */
986e461dc072 Initial revision
glantau
parents:
diff changeset
160 #define PAVGB "pavgusb"
986e461dc072 Initial revision
glantau
parents:
diff changeset
161
986e461dc072 Initial revision
glantau
parents:
diff changeset
162 #include "dsputil_mmx_avg.h"
986e461dc072 Initial revision
glantau
parents:
diff changeset
163
986e461dc072 Initial revision
glantau
parents:
diff changeset
164 #undef DEF
986e461dc072 Initial revision
glantau
parents:
diff changeset
165 #undef PAVGB
986e461dc072 Initial revision
glantau
parents:
diff changeset
166
986e461dc072 Initial revision
glantau
parents:
diff changeset
167 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
168 /* MMX2 specific */
986e461dc072 Initial revision
glantau
parents:
diff changeset
169
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
170 #define DEF(x) x ## _mmx2
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
171
986e461dc072 Initial revision
glantau
parents:
diff changeset
172 /* Introduced only in MMX2 set */
986e461dc072 Initial revision
glantau
parents:
diff changeset
173 #define PAVGB "pavgb"
986e461dc072 Initial revision
glantau
parents:
diff changeset
174
986e461dc072 Initial revision
glantau
parents:
diff changeset
175 #include "dsputil_mmx_avg.h"
986e461dc072 Initial revision
glantau
parents:
diff changeset
176
986e461dc072 Initial revision
glantau
parents:
diff changeset
177 #undef DEF
986e461dc072 Initial revision
glantau
parents:
diff changeset
178 #undef PAVGB
986e461dc072 Initial revision
glantau
parents:
diff changeset
179
986e461dc072 Initial revision
glantau
parents:
diff changeset
180 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
181 /* standard MMX */
986e461dc072 Initial revision
glantau
parents:
diff changeset
182
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
183 #ifdef CONFIG_ENCODERS
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
184 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
185 {
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
186 asm volatile(
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
187 "movl $-128, %%eax \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
188 "pxor %%mm7, %%mm7 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
189 ".balign 16 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
190 "1: \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
191 "movq (%0), %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
192 "movq (%0, %2), %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
193 "movq %%mm0, %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
194 "movq %%mm2, %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
195 "punpcklbw %%mm7, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
196 "punpckhbw %%mm7, %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
197 "punpcklbw %%mm7, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
198 "punpckhbw %%mm7, %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
199 "movq %%mm0, (%1, %%eax)\n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
200 "movq %%mm1, 8(%1, %%eax)\n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
201 "movq %%mm2, 16(%1, %%eax)\n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
202 "movq %%mm3, 24(%1, %%eax)\n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
203 "addl %3, %0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
204 "addl $32, %%eax \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
205 "js 1b \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
206 : "+r" (pixels)
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
207 : "r" (block+64), "r" (line_size), "r" (line_size*2)
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
208 : "%eax"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
209 );
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
210 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
211
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
212 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
324
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
213 {
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
214 asm volatile(
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
215 "pxor %%mm7, %%mm7 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
216 "movl $-128, %%eax \n\t"
324
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
217 ".balign 16 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
218 "1: \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
219 "movq (%0), %%mm0 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
220 "movq (%1), %%mm2 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
221 "movq %%mm0, %%mm1 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
222 "movq %%mm2, %%mm3 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
223 "punpcklbw %%mm7, %%mm0 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
224 "punpckhbw %%mm7, %%mm1 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
225 "punpcklbw %%mm7, %%mm2 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
226 "punpckhbw %%mm7, %%mm3 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
227 "psubw %%mm2, %%mm0 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
228 "psubw %%mm3, %%mm1 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
229 "movq %%mm0, (%2, %%eax)\n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
230 "movq %%mm1, 8(%2, %%eax)\n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
231 "addl %3, %0 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
232 "addl %3, %1 \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
233 "addl $16, %%eax \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
234 "jnz 1b \n\t"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
235 : "+r" (s1), "+r" (s2)
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
236 : "r" (block+64), "r" (stride)
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
237 : "%eax"
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
238 );
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
239 }
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
240 #endif //CONFIG_ENCODERS
324
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
241
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
242 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
243 {
986e461dc072 Initial revision
glantau
parents:
diff changeset
244 const DCTELEM *p;
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
245 uint8_t *pix;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
246
986e461dc072 Initial revision
glantau
parents:
diff changeset
247 /* read the pixels */
986e461dc072 Initial revision
glantau
parents:
diff changeset
248 p = block;
986e461dc072 Initial revision
glantau
parents:
diff changeset
249 pix = pixels;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
250 /* unrolled loop */
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
251 __asm __volatile(
151
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
252 "movq %3, %%mm0\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
253 "movq 8%3, %%mm1\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
254 "movq 16%3, %%mm2\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
255 "movq 24%3, %%mm3\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
256 "movq 32%3, %%mm4\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
257 "movq 40%3, %%mm5\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
258 "movq 48%3, %%mm6\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
259 "movq 56%3, %%mm7\n\t"
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
260 "packuswb %%mm1, %%mm0\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
261 "packuswb %%mm3, %%mm2\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
262 "packuswb %%mm5, %%mm4\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
263 "packuswb %%mm7, %%mm6\n\t"
151
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
264 "movq %%mm0, (%0)\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
265 "movq %%mm2, (%0, %1)\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
266 "movq %%mm4, (%0, %1, 2)\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
267 "movq %%mm6, (%0, %2)\n\t"
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
268 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
269 :"memory");
986e461dc072 Initial revision
glantau
parents:
diff changeset
270 pix += line_size*4;
986e461dc072 Initial revision
glantau
parents:
diff changeset
271 p += 32;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
272
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
273 // if here would be an exact copy of the code above
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
274 // compiler would generate some very strange code
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
275 // thus using "r"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
276 __asm __volatile(
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
277 "movq (%3), %%mm0\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
278 "movq 8(%3), %%mm1\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
279 "movq 16(%3), %%mm2\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
280 "movq 24(%3), %%mm3\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
281 "movq 32(%3), %%mm4\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
282 "movq 40(%3), %%mm5\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
283 "movq 48(%3), %%mm6\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
284 "movq 56(%3), %%mm7\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
285 "packuswb %%mm1, %%mm0\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
286 "packuswb %%mm3, %%mm2\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
287 "packuswb %%mm5, %%mm4\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
288 "packuswb %%mm7, %%mm6\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
289 "movq %%mm0, (%0)\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
290 "movq %%mm2, (%0, %1)\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
291 "movq %%mm4, (%0, %1, 2)\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
292 "movq %%mm6, (%0, %2)\n\t"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
293 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
294 :"memory");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
295 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
296
1985
b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents: 1984
diff changeset
297 static unsigned char __align8 vector128[8] =
b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents: 1984
diff changeset
298 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents: 1984
diff changeset
299
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
300 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
301 {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
302 int i;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
303
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
304 movq_m2r(*vector128, mm1);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
305 for (i = 0; i < 8; i++) {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
306 movq_m2r(*(block), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
307 packsswb_m2r(*(block + 4), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
308 block += 8;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
309 paddb_r2r(mm1, mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
310 movq_r2m(mm0, *pixels);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
311 pixels += line_size;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
312 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
313 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
314
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
315 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
316 {
986e461dc072 Initial revision
glantau
parents:
diff changeset
317 const DCTELEM *p;
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
318 uint8_t *pix;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
319 int i;
986e461dc072 Initial revision
glantau
parents:
diff changeset
320
986e461dc072 Initial revision
glantau
parents:
diff changeset
321 /* read the pixels */
986e461dc072 Initial revision
glantau
parents:
diff changeset
322 p = block;
986e461dc072 Initial revision
glantau
parents:
diff changeset
323 pix = pixels;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
324 MOVQ_ZERO(mm7);
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
325 i = 4;
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
326 do {
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
327 __asm __volatile(
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
328 "movq (%2), %%mm0\n\t"
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
329 "movq 8(%2), %%mm1\n\t"
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
330 "movq 16(%2), %%mm2\n\t"
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
331 "movq 24(%2), %%mm3\n\t"
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
332 "movq %0, %%mm4\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
333 "movq %1, %%mm6\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
334 "movq %%mm4, %%mm5\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
335 "punpcklbw %%mm7, %%mm4\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
336 "punpckhbw %%mm7, %%mm5\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
337 "paddsw %%mm4, %%mm0\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
338 "paddsw %%mm5, %%mm1\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
339 "movq %%mm6, %%mm5\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
340 "punpcklbw %%mm7, %%mm6\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
341 "punpckhbw %%mm7, %%mm5\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
342 "paddsw %%mm6, %%mm2\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
343 "paddsw %%mm5, %%mm3\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
344 "packuswb %%mm1, %%mm0\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
345 "packuswb %%mm3, %%mm2\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
346 "movq %%mm0, %0\n\t"
986e461dc072 Initial revision
glantau
parents:
diff changeset
347 "movq %%mm2, %1\n\t"
151
ae0516eadae2 fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents: 42
diff changeset
348 :"+m"(*pix), "+m"(*(pix+line_size))
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
349 :"r"(p)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
350 :"memory");
986e461dc072 Initial revision
glantau
parents:
diff changeset
351 pix += line_size*2;
986e461dc072 Initial revision
glantau
parents:
diff changeset
352 p += 16;
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
353 } while (--i);
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
354 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
355
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
356 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
357 {
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
358 __asm __volatile(
420
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
359 "lea (%3, %3), %%eax \n\t"
422
aa4a1c6209bd * baling 8 seems to have the same speed
kabi
parents: 421
diff changeset
360 ".balign 8 \n\t"
420
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
361 "1: \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
362 "movq (%1), %%mm0 \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
363 "movq (%1, %3), %%mm1 \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
364 "movq %%mm0, (%2) \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
365 "movq %%mm1, (%2, %3) \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
366 "addl %%eax, %1 \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
367 "addl %%eax, %2 \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
368 "movq (%1), %%mm0 \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
369 "movq (%1, %3), %%mm1 \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
370 "movq %%mm0, (%2) \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
371 "movq %%mm1, (%2, %3) \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
372 "addl %%eax, %1 \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
373 "addl %%eax, %2 \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
374 "subl $4, %0 \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
375 "jnz 1b \n\t"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
376 : "+g"(h), "+r" (pixels), "+r" (block)
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
377 : "r"(line_size)
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
378 : "%eax", "memory"
bbaf743f353f * cleanup for put_pixels_mmx
kabi
parents: 418
diff changeset
379 );
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
380 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
381
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
382 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
383 {
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
384 __asm __volatile(
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
385 "lea (%3, %3), %%eax \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
386 ".balign 8 \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
387 "1: \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
388 "movq (%1), %%mm0 \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
389 "movq 8(%1), %%mm4 \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
390 "movq (%1, %3), %%mm1 \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
391 "movq 8(%1, %3), %%mm5 \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
392 "movq %%mm0, (%2) \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
393 "movq %%mm4, 8(%2) \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
394 "movq %%mm1, (%2, %3) \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
395 "movq %%mm5, 8(%2, %3) \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
396 "addl %%eax, %1 \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
397 "addl %%eax, %2 \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
398 "movq (%1), %%mm0 \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
399 "movq 8(%1), %%mm4 \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
400 "movq (%1, %3), %%mm1 \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
401 "movq 8(%1, %3), %%mm5 \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
402 "movq %%mm0, (%2) \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
403 "movq %%mm4, 8(%2) \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
404 "movq %%mm1, (%2, %3) \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
405 "movq %%mm5, 8(%2, %3) \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
406 "addl %%eax, %1 \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
407 "addl %%eax, %2 \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
408 "subl $4, %0 \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
409 "jnz 1b \n\t"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
410 : "+g"(h), "+r" (pixels), "+r" (block)
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
411 : "r"(line_size)
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
412 : "%eax", "memory"
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
413 );
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
414 }
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
415
296
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
416 static void clear_blocks_mmx(DCTELEM *blocks)
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
417 {
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
418 __asm __volatile(
296
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
419 "pxor %%mm7, %%mm7 \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
420 "movl $-128*6, %%eax \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
421 "1: \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
422 "movq %%mm7, (%0, %%eax) \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
423 "movq %%mm7, 8(%0, %%eax) \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
424 "movq %%mm7, 16(%0, %%eax) \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
425 "movq %%mm7, 24(%0, %%eax) \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
426 "addl $32, %%eax \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
427 " js 1b \n\t"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
428 : : "r" (((int)blocks)+128*6)
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
429 : "%eax"
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
430 );
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
431 }
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
432
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
433 #ifdef CONFIG_ENCODERS
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
/**
 * Sum all bytes of a 16x16 pixel area.
 *
 * Every row is zero-extended to words and accumulated in the four word
 * lanes of mm6; the lanes are added together at the end and the low
 * 16 bits are returned (16*16*255 = 65280 still fits in 16 bits).
 *
 * @param pix       top-left byte of the area
 * @param line_size distance in bytes between two rows
 * @return sum of the 256 bytes
 */
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    /* Negative byte offset that counts up to zero; pointer-width so that it
     * can be used as an index register together with the base pointer. */
    long index= -line_size*h;

    __asm __volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "pxor %%mm6, %%mm6              \n\t"
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t"
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t"
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t"
        " js 1b                         \n\t"
        /* fold the four word lanes of mm6 into its lowest word */
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((long)line_size)
        /* the pixel data is read through pointers that are not memory
         * operands, so pending stores must be completed before the asm */
        : "memory"
    );

    return sum;
}
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
471 #endif //CONFIG_ENCODERS
688
894b61908734 pix_sum16_mmx()
michaelni
parents: 651
diff changeset
472
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
/**
 * Add src to dst byte by byte (wrapping): dst[i] += src[i] for 0 <= i < w.
 *
 * Whole groups of 16 bytes are processed with MMX, the remaining 0..15
 * bytes by the scalar loop at the end.
 *
 * @param dst destination, updated in place
 * @param src bytes to add
 * @param w   number of bytes; values <= 0 do nothing
 */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    /* pointer-width so that it can be used as an index register */
    long i=0;

    /* The asm loop tests its bound only AFTER a 16-byte group has been
     * processed, and it does so with an unsigned compare against w-15.
     * It must therefore not be entered unless one full group exists:
     * otherwise it would write past the buffers and, for w < 15, never
     * see the (negative) bound as reached. */
    if (w >= 16) {
        __asm __volatile(
            "1:                             \n\t"
            "movq  (%1, %0), %%mm0          \n\t"
            "movq  (%2, %0), %%mm1          \n\t"
            "paddb %%mm0, %%mm1             \n\t"
            "movq %%mm1, (%2, %0)           \n\t"
            "movq 8(%1, %0), %%mm0          \n\t"
            "movq 8(%2, %0), %%mm1          \n\t"
            "paddb %%mm0, %%mm1             \n\t"
            "movq %%mm1, 8(%2, %0)          \n\t"
            "add $16, %0                    \n\t"
            "cmp %3, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (i)
            : "r"(src), "r"(dst), "r"((long)w-15)
            /* dst is written and then re-read by the C loop below */
            : "memory"
        );
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
494
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
/*
 * Shared instruction sequence for the H.263 loop filter functions; it is
 * meant to be pasted at the start of an asm statement.
 *
 * Operands used by the sequence:
 *   %0 .. %3  8-byte operands read with movq
 *   %4        operand read with movd; its lowest byte is replicated into
 *             all eight byte lanes
 *   %5        mask applied with pand
 *
 * Registers holding the results when the sequence ends:
 *   mm3  updated bytes derived from %1
 *   mm4  updated bytes derived from %2
 *   mm5  %0 minus the correction left in mm1
 *   mm6  %3 plus  the correction left in mm1
 * All of mm0-mm7 are overwritten.
 */
#define H263_LOOP_FILTER \
        /* mm0/mm1 = %0 - %3 as words (low/high four bytes) */\
        "pxor %%mm7, %%mm7              \n\t"\
        "movq  %0, %%mm0                \n\t"\
        "movq  %0, %%mm1                \n\t"\
        "movq  %3, %%mm2                \n\t"\
        "movq  %3, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "psubw %%mm2, %%mm0             \n\t"\
        "psubw %%mm3, %%mm1             \n\t"\
        /* mm4/mm5 = ((%2 - %1) << 2) + (%0 - %3) as words */\
        "movq  %1, %%mm2                \n\t"\
        "movq  %1, %%mm3                \n\t"\
        "movq  %2, %%mm4                \n\t"\
        "movq  %2, %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "punpcklbw %%mm7, %%mm4         \n\t"\
        "punpckhbw %%mm7, %%mm5         \n\t"\
        "psubw %%mm2, %%mm4             \n\t"\
        "psubw %%mm3, %%mm5             \n\t"\
        "psllw $2, %%mm4                \n\t"\
        "psllw $2, %%mm5                \n\t"\
        "paddw %%mm0, %%mm4             \n\t"\
        "paddw %%mm1, %%mm5             \n\t"\
        /* mm6/mm7 = sign masks, mm4/mm5 = absolute values, then >> 3 */\
        "pxor %%mm6, %%mm6              \n\t"\
        "pcmpgtw %%mm4, %%mm6           \n\t"\
        "pcmpgtw %%mm5, %%mm7           \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "pxor %%mm7, %%mm5              \n\t"\
        "psubw %%mm6, %%mm4             \n\t"\
        "psubw %%mm7, %%mm5             \n\t"\
        "psrlw $3, %%mm4                \n\t"\
        "psrlw $3, %%mm5                \n\t"\
        /* back to bytes: mm4 = magnitudes, mm6 = sign mask per byte */\
        "packuswb %%mm5, %%mm4          \n\t"\
        "packsswb %%mm7, %%mm6          \n\t"\
        "pxor %%mm7, %%mm7              \n\t"\
        /* mm2 = lowest byte of %4 in every byte lane */\
        "movd %4, %%mm2                 \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        /* mm2 = byte correction computed from mm4 and the %4 value */\
        "psubusb %%mm4, %%mm2           \n\t"\
        "movq %%mm2, %%mm3              \n\t"\
        "psubusb %%mm4, %%mm3           \n\t"\
        "psubb %%mm3, %%mm2             \n\t"\
        /* apply it to %1 (saturating add) and %2 (saturating subtract); */\
        /* the surrounding xor with the sign mask mm6 flips the direction */\
        "movq %1, %%mm3                 \n\t"\
        "movq %2, %%mm4                 \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm3           \n\t"\
        "psubusb %%mm2, %%mm4           \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        /* mm1 = second correction, built from (%0 - %3) and 2*mm2 */\
        "paddusb %%mm2, %%mm2           \n\t"\
        "packsswb %%mm1, %%mm0          \n\t"\
        "pcmpgtb %%mm0, %%mm7           \n\t"\
        "pxor %%mm7, %%mm0              \n\t"\
        "psubb %%mm7, %%mm0             \n\t"\
        "movq %%mm0, %%mm1              \n\t"\
        "psubusb %%mm2, %%mm0           \n\t"\
        "psubb %%mm0, %%mm1             \n\t"\
        "pand %5, %%mm1                 \n\t"\
        "psrlw $2, %%mm1                \n\t"\
        "pxor %%mm7, %%mm1              \n\t"\
        "psubb %%mm7, %%mm1             \n\t"\
        /* mm5 = %0 - mm1, mm6 = %3 + mm1 */\
        "movq %0, %%mm5                 \n\t"\
        "movq %3, %%mm6                 \n\t"\
        "psubb %%mm1, %%mm5             \n\t"\
        "paddb %%mm1, %%mm6             \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
565
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
566 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
567 const int strength= ff_h263_loop_filter_strength[qscale];
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
568
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
569 asm volatile(
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
570
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
571 H263_LOOP_FILTER
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
572
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
573 "movq %%mm3, %1 \n\t"
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
574 "movq %%mm4, %2 \n\t"
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
575 "movq %%mm5, %0 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
576 "movq %%mm6, %3 \n\t"
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
577 : "+m" (*(uint64_t*)(src - 2*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
578 "+m" (*(uint64_t*)(src - 1*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
579 "+m" (*(uint64_t*)(src + 0*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
580 "+m" (*(uint64_t*)(src + 1*stride))
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
581 : "g" (2*strength), "m"(ff_pb_FC)
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
582 );
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
583 }
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
584
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
/**
 * Transpose a 4x4 block of bytes: dst[x][y] = src[y][x].
 *
 * @param dst        destination block, four rows of four bytes are written
 * @param src        source block, four rows of four bytes are read
 * @param dst_stride distance in bytes between destination rows
 * @param src_stride distance in bytes between source rows
 *
 * Uses mm0-mm3 and does not execute emms.
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    uint32_t *const out0= (uint32_t*)(dst + 0*dst_stride);
    uint32_t *const out1= (uint32_t*)(dst + 1*dst_stride);
    uint32_t *const out2= (uint32_t*)(dst + 2*dst_stride);
    uint32_t *const out3= (uint32_t*)(dst + 3*dst_stride);
    const uint32_t *const in0= (uint32_t*)(src + 0*src_stride);
    const uint32_t *const in1= (uint32_t*)(src + 1*src_stride);
    const uint32_t *const in2= (uint32_t*)(src + 2*src_stride);
    const uint32_t *const in3= (uint32_t*)(src + 3*src_stride);

    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
        /* load the four source rows */
        "movd %4, %%mm0 \n\t"
        "movd %5, %%mm1 \n\t"
        "movd %6, %%mm2 \n\t"
        "movd %7, %%mm3 \n\t"
        /* interleave bytes of rows 0/1 and of rows 2/3 */
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        /* interleave 16-bit pairs: mm0 = columns 0 and 1, mm1 = columns 2 and 3 */
        "punpcklwd %%mm2, %%mm0 \n\t"
        "punpckhwd %%mm2, %%mm1 \n\t"
        /* each 32-bit half is one finished destination row */
        "movd %%mm0, %0 \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, %1 \n\t"
        "movd %%mm1, %2 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, %3 \n\t"

        : "=m" (*out0),
          "=m" (*out1),
          "=m" (*out2),
          "=m" (*out3)
        : "m" (*in0),
          "m" (*in1),
          "m" (*in2),
          "m" (*in3)
    );
}
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
613
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
614 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
615 const int strength= ff_h263_loop_filter_strength[qscale];
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
616 uint64_t temp[4] __attribute__ ((aligned(8)));
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
617 uint8_t *btemp= (uint8_t*)temp;
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
618
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
619 src -= 2;
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
620
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
621 transpose4x4(btemp , src , 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
622 transpose4x4(btemp+4, src + 4*stride, 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
623 asm volatile(
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
624 H263_LOOP_FILTER // 5 3 4 6
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
625
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
626 : "+m" (temp[0]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
627 "+m" (temp[1]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
628 "+m" (temp[2]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
629 "+m" (temp[3])
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
630 : "g" (2*strength), "m"(ff_pb_FC)
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
631 );
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
632
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
633 asm volatile(
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
634 "movq %%mm5, %%mm1 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
635 "movq %%mm4, %%mm0 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
636 "punpcklbw %%mm3, %%mm5 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
637 "punpcklbw %%mm6, %%mm4 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
638 "punpckhbw %%mm3, %%mm1 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
639 "punpckhbw %%mm6, %%mm0 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
640 "movq %%mm5, %%mm3 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
641 "movq %%mm1, %%mm6 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
642 "punpcklwd %%mm4, %%mm5 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
643 "punpcklwd %%mm0, %%mm1 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
644 "punpckhwd %%mm4, %%mm3 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
645 "punpckhwd %%mm0, %%mm6 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
646 "movd %%mm5, %0 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
647 "punpckhdq %%mm5, %%mm5 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
648 "movd %%mm5, %1 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
649 "movd %%mm3, %2 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
650 "punpckhdq %%mm3, %%mm3 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
651 "movd %%mm3, %3 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
652 "movd %%mm1, %4 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
653 "punpckhdq %%mm1, %%mm1 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
654 "movd %%mm1, %5 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
655 "movd %%mm6, %6 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
656 "punpckhdq %%mm6, %%mm6 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
657 "movd %%mm6, %7 \n\t"
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
658 : "=m" (*(uint32_t*)(src + 0*stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
659 "=m" (*(uint32_t*)(src + 1*stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
660 "=m" (*(uint32_t*)(src + 2*stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
661 "=m" (*(uint32_t*)(src + 3*stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
662 "=m" (*(uint32_t*)(src + 4*stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
663 "=m" (*(uint32_t*)(src + 5*stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
664 "=m" (*(uint32_t*)(src + 6*stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
665 "=m" (*(uint32_t*)(src + 7*stride))
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
666 );
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
667 }
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
668
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
669 #ifdef CONFIG_ENCODERS
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
/**
 * Sum of the squared pixel values of a 16x16 block.
 *
 * @param pix       top-left pixel of the block; 16 bytes are read per row
 * @param line_size distance in bytes between two consecutive rows
 * @return sum over all 256 pixels of pix[y*line_size + x]^2
 *
 * Uses mm0-mm4 and mm7 and does not execute emms.
 */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    __asm__ volatile (
        "movl $16,%%ecx\n"          /* row counter */
        "pxor %%mm0,%%mm0\n"        /* mm0 = 0, used to widen bytes to words */
        "pxor %%mm7,%%mm7\n"        /* mm7 = two 32-bit partial sums */
        "1:\n"
        "movq (%0),%%mm2\n"         /* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n"        /* mm3 = pix[8-15] */

        "movq %%mm2,%%mm1\n"        /* mm1 = mm2 = pix[0-7] */

        "punpckhbw %%mm0,%%mm1\n"   /* mm1 = [pix4-7] */
        "punpcklbw %%mm0,%%mm2\n"   /* mm2 = [pix0-3] */

        "movq %%mm3,%%mm4\n"        /* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n"   /* mm3 = [pix12-15] */
        "punpcklbw %%mm0,%%mm4\n"   /* mm4 = [pix8-11] */

        "pmaddwd %%mm1,%%mm1\n"     /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
        "pmaddwd %%mm2,%%mm2\n"     /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

        "pmaddwd %%mm3,%%mm3\n"     /* mm3 = (pix12^2+pix13^2,pix14^2+pix15^2) */
        "pmaddwd %%mm4,%%mm4\n"     /* mm4 = (pix8^2+pix9^2,pix10^2+pix11^2) */

        "paddd %%mm1,%%mm2\n"       /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                              pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"

        /* no size suffix: the assembler picks the pointer width from the
           register operands, so this also assembles with 64-bit pointers */
        "add %2, %0\n"
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((long)line_size)     /* same width as the pointer register */
        : "%ecx", "memory" );       /* "memory": the block is read through pix */
    return tmp;
}
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
712
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
/**
 * Sum of squared differences between two blocks that are 16 pixels wide.
 *
 * @param v         unused
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @param h         number of rows to compare
 * @return sum over h rows and 16 columns of (pix1 - pix2)^2;
 *         0 when h is not positive
 *
 * Uses mm0-mm7 and does not execute emms.
 */
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    /* The loop below decrements before testing, so a zero or negative row
       count would wrap around and read far outside both blocks. */
    if (h <= 0)
        return 0;

    __asm__ volatile (
        "movl %4,%%ecx\n"           /* row counter */
        "pxor %%mm0,%%mm0\n"        /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"        /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"         /* mm1 = pix1[0-7] */
        "movq (%1),%%mm2\n"         /* mm2 = pix2[0-7] */
        "movq 8(%0),%%mm3\n"        /* mm3 = pix1[8-15] */
        "movq 8(%1),%%mm4\n"        /* mm4 = pix2[8-15] */

        /* absolute difference: subtract with unsigned saturation in both
           directions (one result is always 0), then OR the two results */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"         /* mm2 = |pix1-pix2| for pixels 0-7 */
        "por %%mm3,%%mm4\n"         /* mm4 = |pix1-pix2| for pixels 8-15 */

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"   /* pixels 0-7 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n"   /* pixels 8-15 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        /* no size suffix: the assembler picks the pointer width from the
           register operands, so this also assembles with 64-bit pointers */
        "add %3,%0\n"
        "add %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)   /* stride as wide as the pointers */
        : "%ecx", "memory");        /* "memory": both blocks are read via pointers */
    return tmp;
}
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
772
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
/**
 * Vertical activity of a 16-pixel-wide block, plain MMX version.
 *
 * Sums |pix[y][x] - pix[y-1][x]| for x = 0..15 and y = 1..h-1 and returns
 * the low 16 bits of that sum.
 *
 * @param v         unused
 * @param pix       top-left pixel of the block
 * @param dummy     unused
 * @param line_size distance in bytes between two rows
 * @param h         number of rows; must be even and >= 4, because the loop
 *                  handles one row up front and then two rows per iteration
 *                  and only terminates when the counter hits exactly zero
 * @return sum of absolute vertical differences, modulo 65536
 *
 * The MMX state is left dirty (no emms is executed here).
 */
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((long)pix) & 7) == 0);
    assert((line_size &7) ==0);
    assert(h >= 4 && (h & 1) == 0);

/* One row step: in0/in1 hold the previous row (low/high 8 bytes). The
 * current row is loaded, saved in out0/out1 for the next step, and the
 * per-byte absolute difference (built from two saturating subtractions
 * OR-ed together) is widened to words with mm7 == 0 and added to mm6. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  /* "add" without a size suffix plus a register-wide line_size keeps the
   * pointer arithmetic valid on both 32-bit and 64-bit x86. The "memory"
   * clobber tells the compiler that the asm reads the pixel buffer. */
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      /* horizontal add of the four word lanes of mm6 into lane 0 */
      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx", "memory");
    /* only word lane 0 holds the total */
    return tmp & 0xFFFF;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
834
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
/**
 * Vertical activity of a 16-pixel-wide block, MMX2 (psadbw) version.
 *
 * Sums |pix[y][x] - pix[y-1][x]| for x = 0..15 and y = 1..h-1.
 *
 * @param v         unused
 * @param pix       top-left pixel of the block
 * @param dummy     unused
 * @param line_size distance in bytes between two rows
 * @param h         number of rows; must be even and >= 4, because the loop
 *                  handles one row up front and then two rows per iteration
 *                  and only terminates when the counter hits exactly zero
 * @return the sum, accumulated with 16-bit word additions (paddw)
 *
 * The MMX state is left dirty (no emms is executed here).
 */
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((long)pix) & 7) == 0);
    assert((line_size &7) ==0);
    assert(h >= 4 && (h & 1) == 0);

/* One row step: in0/in1 hold the previous row (low/high 8 bytes). The
 * current row is loaded into out0/out1 (kept for the next step), psadbw
 * reduces each half to a single sum and both are added to mm6. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  /* "add" without a size suffix plus a register-wide line_size keeps the
   * pointer arithmetic valid on both 32-bit and 64-bit x86. The "memory"
   * clobber tells the compiler that the asm reads the pixel buffer.
   * mm7 is cleared here but not referenced by SUM in this variant. */
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx", "memory");
    return tmp;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
875
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
/**
 * Vertical activity of the difference between two 16-pixel-wide blocks,
 * plain MMX version.
 *
 * For every pixel the byte difference d = ((pix1 - pix2) mod 256) ^ 0x80 is
 * formed (wrapping psubb, then xor with 0x80). The function sums
 * |d[y][x] - d[y-1][x]| for x = 0..15 and y = 1..h-1 and returns the low
 * 15 bits of that sum.
 *
 * @param v         unused
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size distance in bytes between two rows (used for both blocks)
 * @param h         number of rows; must be even and >= 4, because the loop
 *                  handles one row up front and then two rows per iteration
 *                  and only terminates when the counter hits exactly zero
 * @return the sum described above, masked with 0x7FFF
 *
 * The MMX state is left dirty (no emms is executed here).
 */
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((long)pix1) & 7) == 0);
    assert( (((long)pix2) & 7) == 0);
    assert((line_size &7) ==0);
    assert(h >= 4 && (h & 1) == 0);

/* One row step: in0/in1 hold the previous row of biased differences.
 * The current biased difference row is built in mm2/mm3, saved in
 * out0/out1, and the per-byte absolute difference is widened to words and
 * added to mm6. mm7 holds 0x80 in every byte here, so each widened word
 * carries an extra 0x8000; exactly four words are added per lane, so that
 * bias cancels modulo 2^16. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  /* "add" without a size suffix plus a register-wide line_size keeps the
   * pointer arithmetic valid on both 32-bit and 64-bit x86. The "memory"
   * clobber tells the compiler that the asm reads the pixel buffers. */
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      /* mm7 = 0x80 in every byte: all ones -> 0x8000 words -> saturating pack */
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      /* horizontal add of the four word lanes of mm6 into lane 0 */
      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx", "memory");
    return tmp & 0x7FFF;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
954
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
/**
 * Vertical activity of the difference between two 16-pixel-wide blocks,
 * MMX2 (psadbw) version.
 *
 * For every pixel the byte difference d = ((pix1 - pix2) mod 256) ^ 0x80 is
 * formed (wrapping psubb, then xor with 0x80). The function sums
 * |d[y][x] - d[y-1][x]| for x = 0..15 and y = 1..h-1.
 *
 * @param v         unused
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size distance in bytes between two rows (used for both blocks)
 * @param h         number of rows; must be even and >= 4, because the loop
 *                  handles one row up front and then two rows per iteration
 *                  and only terminates when the counter hits exactly zero
 * @return the sum, accumulated with 16-bit word additions (paddw)
 *
 * The MMX state is left dirty (no emms is executed here).
 */
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((long)pix1) & 7) == 0);
    assert( (((long)pix2) & 7) == 0);
    assert((line_size &7) ==0);
    assert(h >= 4 && (h & 1) == 0);

/* One row step: in0/in1 hold the previous row of biased differences.
 * The current biased difference row is built in out0/out1 (kept for the
 * next step), psadbw reduces each half to a single sum and both are added
 * to mm6. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  /* "add" without a size suffix plus a register-wide line_size keeps the
   * pointer arithmetic valid on both 32-bit and 64-bit x86. The "memory"
   * clobber tells the compiler that the asm reads the pixel buffers. */
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      /* mm7 = 0x80 in every byte: all ones -> 0x8000 words -> saturating pack */
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx", "memory");
    return tmp;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
1012
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
/**
 * Byte-wise difference of two rows: dst[i] = src1[i] - src2[i] (mod 256)
 * for i = 0..w-1.
 *
 * Full 16-byte groups are handled with MMX, the remaining bytes in C.
 *
 * @param dst  output row, at least w bytes
 * @param src1 minuend row, at least w bytes
 * @param src2 subtrahend row, at least w bytes
 * @param w    number of bytes to process
 *
 * The MMX state is left dirty when the MMX loop runs (no emms here).
 */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;

    /* The asm loop tests its bound only after the first 16 bytes have been
     * processed, and it compares unsigned. For w < 16 it would touch bytes
     * past the buffers, and for w < 15 the bound w-15 would wrap to a huge
     * value, so rows that short are left entirely to the C loop below. */
    if(w >= 16)
    __asm__ volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
        : "memory" /* the asm reads src1/src2 and writes dst */
    );
    /* tail (or the whole row when w < 16) */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1034
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
/**
 * Subtracts a median prediction from a row (MMX2 version).
 *
 * For each position the predictor is the median of L, T and L + T - LT
 * (byte arithmetic), where T = src1[i], LT = src1[i-1], L = src2[i-1];
 * dst[i] = src2[i] - predictor. dst[0] is afterwards recomputed in C from
 * *left and *left_top instead of src2[-1] / src1[-1].
 *
 * @param dst      output row
 * @param src1     row supplying T and LT
 * @param src2     row supplying L and the values being predicted
 * @param w        row width in bytes
 * @param left     in: left neighbour for position 0; out: src2[w-1]
 * @param left_top in: top-left neighbour for position 0; out: src1[w-1]
 *
 * The asm loop works on 8 bytes per iteration, always runs at least once
 * and stops once the index is >= w (unsigned compare). Consequently it
 * reads src1[-1] and src2[-1] in the first iteration, and when w is not a
 * multiple of 8 it reads and writes up to 7 bytes past index w-1.
 * NOTE(review): callers presumably provide suitably padded buffers and
 * w > 0 -- confirm against the call sites.
 *
 * The MMX state is left dirty (no emms is executed here).
 */
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    /* The "memory" clobber is required: the asm stores to dst (including
     * dst[0]), and the C fix-up of dst[0] below must not be scheduled
     * before it. Suffix-less add/cmp with a register-wide index keep the
     * (base, index) addressing valid on both 32-bit and 64-bit x86. */
    __asm__ volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t" // median(T, L, L + T - LT)
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
        : "memory"
    );

    l= *left;
    lt= *left_top;

    /* position 0 has no in-row neighbours; redo it with the carried-in ones */
    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    /* carry the last column over to the next call */
    *left_top= src1[w-1];
    *left    = src2[w-1];
}
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1070
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
/* Emits asm text for two independent word-wise butterflies:
 *   (x0, y0) -> (x0 + y0, y0 - x0)   and   (x1, y1) -> (x1 + y1, y1 - x1)
 * The difference is formed as 2*y - (x + y), so no scratch register is
 * needed. The two butterflies are interleaved instruction by instruction. */
#define LBUTTERFLY2(x0,y0,x1,y1)\
    "paddw " #y0 ", " #x0 " \n\t"\
    "paddw " #y1 ", " #x1 " \n\t"\
    "paddw " #y0 ", " #y0 " \n\t"\
    "paddw " #y1 ", " #y1 " \n\t"\
    "psubw " #x0 ", " #y0 " \n\t"\
    "psubw " #x1 ", " #y1 " \n\t"
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
1078
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/**
 * Three butterfly stages across the eight registers mm0..mm7, treating
 * register k as element k of an 8-point vector (applied independently to
 * each of the four 16-bit lanes):
 *   stage 1 pairs registers 1 apart: (0,1) (2,3) (4,5) (6,7)
 *   stage 2 pairs registers 2 apart: (0,2) (1,3) (4,6) (5,7)
 *   stage 3 pairs registers 4 apart: (0,4) (1,5) (2,6) (3,7)
 *
 * Expands through LBUTTERFLY2, which must be defined at the point of use.
 * The definition deliberately ends without a line continuation so that it
 * cannot swallow the line that follows it.
 */
#define HADAMARD48\
    LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
    LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
    LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
    LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
    LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
    LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1086
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/**
 * In-place absolute value of the four signed 16-bit lanes of `val`.
 *
 * `mask` is a scratch register: it is overwritten and left holding the
 * sign mask (all ones in lanes where val was negative, zero elsewhere).
 * A lane equal to -32768 has no positive counterpart and is left as 0x8000.
 */
#define MMABS(val, mask)\
    "pxor " #mask ", " #mask " \n\t"   /* mask = 0                          */\
    "pcmpgtw " #val ", " #mask " \n\t" /* mask = (0 > val) ? 0xFFFF : 0     */\
    "pxor " #mask ", " #val " \n\t"    /* one's complement of negative lanes */\
    "psubw " #mask ", " #val " \n\t"   /* subtract -1 there: finishes negate */
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1092
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/**
 * Take the per-lane absolute value of `val` (signed 16-bit lanes, in place)
 * and add it to `acc` with unsigned saturation.
 *
 * `mask` is a scratch register and ends up holding the sign mask of the
 * original `val`; `val` itself is left holding its absolute value.
 */
#define MMABS_SUM(val, mask, acc)\
    "pxor " #mask ", " #mask " \n\t"   /* mask = 0                          */\
    "pcmpgtw " #val ", " #mask " \n\t" /* mask = (0 > val) ? 0xFFFF : 0     */\
    "pxor " #mask ", " #val " \n\t"    /* one's complement of negative lanes */\
    "psubw " #mask ", " #val " \n\t"   /* subtract -1 there: finishes negate */\
    "paddusw " #val ", " #acc " \n\t"  /* acc += |val|, clamped at 0xFFFF    */
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1099
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
/**
 * In-place absolute value of the four signed 16-bit lanes of `val`, using
 * the signed-maximum instruction: |val| = max(val, -val).
 *
 * `neg` is a scratch register: it is overwritten and left holding -val
 * (wrap-around negate of the original value).
 */
#define MMABS_MMX2(val, neg)\
    "pxor " #neg ", " #neg " \n\t"   /* neg = 0             */\
    "psubw " #val ", " #neg " \n\t"  /* neg = -val          */\
    "pmaxsw " #neg ", " #val " \n\t" /* val = max(val, neg) */
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1104
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
/**
 * Take the per-lane absolute value of `val` (signed 16-bit lanes, in place,
 * computed as max(val, -val)) and add it to `acc` with unsigned saturation.
 *
 * `neg` is a scratch register and is left holding the negated original.
 */
#define MMABS_SUM_MMX2(val, neg, acc)\
    "pxor " #neg ", " #neg " \n\t"    /* neg = 0                         */\
    "psubw " #val ", " #neg " \n\t"   /* neg = -val                      */\
    "pmaxsw " #neg ", " #val " \n\t"  /* val = max(val, neg)             */\
    "paddusw " #val ", " #acc " \n\t" /* acc += |val|, clamped at 0xFFFF */
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1110
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/**
 * Interleave two registers at granularity `n` (the punpckl/punpckh suffix,
 * e.g. wd or dq).
 *
 * On exit `a` holds the interleaved low halves of the original a and b, and
 * `t` holds the interleaved high halves; `b` is not modified. With a = abcd
 * and b = efgh (n = wd) this gives a = aebf, t = cgdh.
 *
 * The definition deliberately ends without a line continuation so that it
 * cannot swallow the line that follows it.
 */
#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t " \n\t"           /* t = abcd */\
    "punpckl" #n " " #b ", " #a " \n\t"  /* a = aebf */\
    "punpckh" #n " " #b ", " #t " \n\t"  /* t = cgdh */
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1115
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/**
 * Transpose a 4x4 tile of 16-bit words held in four registers.
 *
 * Input rows:  r0 = abcd, r1 = efgh, r2 = ijkl, r3 = mnop.
 * Output rows, in order: r0 = aeim, r3 = bfjn, tmp = cgko, r2 = dhlp.
 * r1 is used as intermediate storage and does not hold a result row.
 *
 * Expands through SBUTTERFLY, which must be defined at the point of use.
 */
#define TRANSPOSE4(r0, r1, r2, r3, tmp)\
    SBUTTERFLY(r0,  r1, tmp, wd) /* r0  = aebf, tmp = cgdh */\
    SBUTTERFLY(r2,  r3, r1,  wd) /* r2  = imjn, r1  = kolp */\
    SBUTTERFLY(r0,  r2, r3,  dq) /* r0  = aeim, r3  = bfjn */\
    SBUTTERFLY(tmp, r1, r2,  dq) /* tmp = cgko, r2  = dhlp */
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1121
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/**
 * Load four quadwords into r0..r3 from byte offsets off, off+16, off+32 and
 * off+48 relative to the pointer in asm operand %1.
 *
 * The operand number is hard-wired, so the enclosing asm statement must pass
 * the buffer as its operand 1.
 */
#define LOAD4(off, r0, r1, r2, r3)\
    "movq "#off"(%1), " #r0 " \n\t"\
    "movq "#off"+16(%1), " #r1 " \n\t"\
    "movq "#off"+32(%1), " #r2 " \n\t"\
    "movq "#off"+48(%1), " #r3 " \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1127
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/**
 * Store the four quadword registers a..d to byte offsets o, o+16, o+32 and
 * o+48 relative to the pointer in asm operand %1.
 *
 * The operand number is hard-wired, so the enclosing asm statement must pass
 * the buffer as its operand 1.
 *
 * The definition deliberately ends without a line continuation so that it
 * cannot swallow the line that follows it.
 */
#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1) \n\t"\
    "movq "#b", "#o"+16(%1) \n\t"\
    "movq "#c", "#o"+32(%1) \n\t"\
    "movq "#d", "#o"+48(%1) \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1133
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
1134 static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1135 uint64_t temp[16] __align8;
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1136 int sum=0;
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
1137
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
1138 assert(h==8);
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1139
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1140 diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1141
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1142 asm volatile(
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1143 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1144 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1145
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1146 HADAMARD48
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1147
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1148 "movq %%mm7, 112(%1) \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1149
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1150 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1151 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1152
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1153 "movq 112(%1), %%mm7 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1154 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1155 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1156
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1157 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1158 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1159
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1160 HADAMARD48
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1161
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1162 "movq %%mm7, 120(%1) \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1163
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1164 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1165 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1166
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1167 "movq 120(%1), %%mm7 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1168 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1169 "movq %%mm7, %%mm5 \n\t"//FIXME remove
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1170 "movq %%mm6, %%mm7 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1171 "movq %%mm0, %%mm6 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1172 // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1173
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1174 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1175 // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1176
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1177 HADAMARD48
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1178 "movq %%mm7, 64(%1) \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1179 MMABS(%%mm0, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1180 MMABS_SUM(%%mm1, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1181 MMABS_SUM(%%mm2, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1182 MMABS_SUM(%%mm3, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1183 MMABS_SUM(%%mm4, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1184 MMABS_SUM(%%mm5, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1185 MMABS_SUM(%%mm6, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1186 "movq 64(%1), %%mm1 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1187 MMABS_SUM(%%mm1, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1188 "movq %%mm0, 64(%1) \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1189
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1190 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1191 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1192
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1193 HADAMARD48
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1194 "movq %%mm7, (%1) \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1195 MMABS(%%mm0, %%mm7)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1196 MMABS_SUM(%%mm1, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1197 MMABS_SUM(%%mm2, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1198 MMABS_SUM(%%mm3, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1199 MMABS_SUM(%%mm4, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1200 MMABS_SUM(%%mm5, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1201 MMABS_SUM(%%mm6, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1202 "movq (%1), %%mm1 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1203 MMABS_SUM(%%mm1, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1204 "movq 64(%1), %%mm1 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1205 MMABS_SUM(%%mm1, %%mm7, %%mm0)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1206
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1207 "movq %%mm0, %%mm1 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1208 "psrlq $32, %%mm0 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1209 "paddusw %%mm1, %%mm0 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1210 "movq %%mm0, %%mm1 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1211 "psrlq $16, %%mm0 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1212 "paddusw %%mm1, %%mm0 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1213 "movd %%mm0, %0 \n\t"
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1214
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1215 : "=r" (sum)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1216 : "r"(temp)
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1217 );
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1218 return sum&0xFFFF;
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1219 }
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1220
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
1221 static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1222 uint64_t temp[16] __align8;
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1223 int sum=0;
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
1224
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
1225 assert(h==8);
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1226
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1227 diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1228
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1229 asm volatile(
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1230 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1231 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1232
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1233 HADAMARD48
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1234
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1235 "movq %%mm7, 112(%1) \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1236
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1237 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1238 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1239
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1240 "movq 112(%1), %%mm7 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1241 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1242 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1243
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1244 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1245 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1246
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1247 HADAMARD48
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1248
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1249 "movq %%mm7, 120(%1) \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1250
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1251 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1252 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1253
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1254 "movq 120(%1), %%mm7 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1255 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1256 "movq %%mm7, %%mm5 \n\t"//FIXME remove
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1257 "movq %%mm6, %%mm7 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1258 "movq %%mm0, %%mm6 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1259 // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1260
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1261 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1262 // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1263
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1264 HADAMARD48
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1265 "movq %%mm7, 64(%1) \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1266 MMABS_MMX2(%%mm0, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1267 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1268 MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1269 MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1270 MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1271 MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1272 MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1273 "movq 64(%1), %%mm1 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1274 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1275 "movq %%mm0, 64(%1) \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1276
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1277 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1278 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1279
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1280 HADAMARD48
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1281 "movq %%mm7, (%1) \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1282 MMABS_MMX2(%%mm0, %%mm7)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1283 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1284 MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1285 MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1286 MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1287 MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1288 MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1289 "movq (%1), %%mm1 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1290 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1291 "movq 64(%1), %%mm1 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1292 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1293
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1294 "movq %%mm0, %%mm1 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1295 "psrlq $32, %%mm0 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1296 "paddusw %%mm1, %%mm0 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1297 "movq %%mm0, %%mm1 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1298 "psrlq $16, %%mm0 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1299 "paddusw %%mm1, %%mm0 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1300 "movd %%mm0, %0 \n\t"
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1301
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1302 : "=r" (sum)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1303 : "r"(temp)
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1304 );
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1305 return sum&0xFFFF;
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1306 }
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1307
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1308
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
/* Instantiate hadamard8_diff16_mmx and hadamard8_diff16_mmx2 from the two
 * 8-row functions hadamard8_diff_mmx and hadamard8_diff_mmx2.
 * NOTE(review): WARPER8_16_SQ is defined outside this section; presumably it
 * applies the 8x8 function to each quadrant of a 16x16 block -- confirm
 * against the macro definition. */
1309 WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
1310 WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
1311 #endif //CONFIG_ENCODERS
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
1312
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* The no-rounding 8-wide put forwards its four arguments unchanged to
 * put_pixels8_mmx.
 * NOTE(review): presumably valid because a plain copy involves no rounding
 * -- confirm against put_pixels8_mmx. */
#define put_no_rnd_pixels8_mmx(arg0, arg1, arg2, arg3) put_pixels8_mmx(arg0, arg1, arg2, arg3)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* The no-rounding 16-wide put forwards its four arguments unchanged to
 * put_pixels16_mmx.
 * NOTE(review): presumably valid because a plain copy involves no rounding
 * -- confirm against put_pixels16_mmx. */
#define put_no_rnd_pixels16_mmx(arg0, arg1, arg2, arg3) put_pixels16_mmx(arg0, arg1, arg2, arg3)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1315
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/**
 * One output step of the vertical lowpass filter on packed 16-bit words.
 *
 * Computes, per lane,
 *     (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5      (arithmetic shift)
 * where
 *     x1 = m3  + m4
 *     x2 = in2 + m5
 *     x3 = in1 + m6
 *     x4 = in0 + in7
 * packs the result to bytes with unsigned saturation and hands it to
 * OP(%%mm5, out, %%mm7, d).
 *
 * Register use: %%mm4, %%mm5 and %%mm6 are hard-wired scratch registers;
 * m3 is overwritten and holds the value loaded from in7 on exit.
 * The pw_20 and pw_3 parameters are accepted but not referenced: the
 * constants are taken from MANGLE(ff_pw_20) and MANGLE(ff_pw_3) instead.
 */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 " \n\t"               /* m3  = x1                    */\
    "movq "MANGLE(ff_pw_20)", %%mm4 \n\t"       /* mm4 = 20                    */\
    "pmullw " #m3 ", %%mm4 \n\t"                /* mm4 = 20*x1                 */\
    "movq "#in7", " #m3 " \n\t"                 /* m3  = in7                   */\
    "movq "#in0", %%mm5 \n\t"                   /* mm5 = in0                   */\
    "paddw " #m3 ", %%mm5 \n\t"                 /* mm5 = x4                    */\
    "psubw %%mm5, %%mm4 \n\t"                   /* mm4 = 20*x1 - x4            */\
    "movq "#in1", %%mm5 \n\t"                   /* mm5 = in1                   */\
    "movq "#in2", %%mm6 \n\t"                   /* mm6 = in2                   */\
    "paddw " #m6 ", %%mm5 \n\t"                 /* mm5 = x3                    */\
    "paddw " #m5 ", %%mm6 \n\t"                 /* mm6 = x2                    */\
    "paddw %%mm6, %%mm6 \n\t"                   /* mm6 = 2*x2                  */\
    "psubw %%mm6, %%mm5 \n\t"                   /* mm5 = x3 - 2*x2             */\
    "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t"      /* mm5 = 3*x3 - 6*x2           */\
    "paddw " #rnd ", %%mm4 \n\t"                /* mm4 += rounding constant    */\
    "paddw %%mm4, %%mm5 \n\t"                   /* mm5 = full weighted sum     */\
    "psraw $5, %%mm5 \n\t"                      /* scale down by 32            */\
    "packuswb %%mm5, %%mm5 \n\t"                /* clamp to bytes              */\
    OP(%%mm5, out, %%mm7, d)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1336
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1337 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
1338 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1339 uint64_t temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1340 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1341 asm volatile(\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1342 "pxor %%mm7, %%mm7 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1343 "1: \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1344 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1345 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1346 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1347 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1348 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1349 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1350 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1351 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1352 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1353 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1354 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1355 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1356 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1357 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1358 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1359 "paddw %%mm3, %%mm5 \n\t" /* b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1360 "paddw %%mm2, %%mm6 \n\t" /* c */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1361 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1362 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1363 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1364 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1365 "paddw %%mm4, %%mm0 \n\t" /* a */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1366 "paddw %%mm1, %%mm5 \n\t" /* d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1367 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1368 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1369 "paddw %6, %%mm6 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1370 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1371 "psraw $5, %%mm0 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1372 "movq %%mm0, %5 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1373 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1374 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1375 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1376 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1377 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1378 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1379 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1380 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1381 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1382 "paddw %%mm0, %%mm2 \n\t" /* b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1383 "paddw %%mm5, %%mm3 \n\t" /* c */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1384 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1385 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1386 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1387 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1388 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1389 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1390 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1391 "paddw %%mm2, %%mm1 \n\t" /* a */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1392 "paddw %%mm6, %%mm4 \n\t" /* d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1393 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1394 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1395 "paddw %6, %%mm1 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1396 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1397 "psraw $5, %%mm3 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1398 "movq %5, %%mm1 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1399 "packuswb %%mm3, %%mm1 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1400 OP_MMX2(%%mm1, (%1),%%mm4, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1401 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1402 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1403 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1404 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1405 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1406 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1407 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1408 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1409 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1410 "paddw %%mm1, %%mm5 \n\t" /* b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1411 "paddw %%mm4, %%mm0 \n\t" /* c */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1412 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1413 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1414 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1415 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1416 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1417 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1418 "paddw %%mm3, %%mm2 \n\t" /* d */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1419 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1420 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1421 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1422 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1423 "paddw %%mm2, %%mm6 \n\t" /* a */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1424 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1425 "paddw %6, %%mm0 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1426 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1427 "psraw $5, %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1428 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1429 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1430 "paddw %%mm5, %%mm3 \n\t" /* a */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1431 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1432 "paddw %%mm4, %%mm6 \n\t" /* b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1433 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1434 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1435 "paddw %%mm1, %%mm4 \n\t" /* c */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1436 "paddw %%mm2, %%mm5 \n\t" /* d */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1437 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1438 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1439 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1440 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1441 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1442 "paddw %6, %%mm4 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1443 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1444 "psraw $5, %%mm4 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1445 "packuswb %%mm4, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1446 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1447 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1448 "addl %3, %0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1449 "addl %4, %1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1450 "decl %2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1451 " jnz 1b \n\t"\
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
1452 : "+a"(src), "+c"(dst), "+m"(h)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1453 : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1454 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1455 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1456 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1457 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/* OPNAME##mpeg4_qpel16_h_lowpass_3dnow: horizontal lowpass producing 16 output pixels per row. */\
/* For each of the h rows it reads src[0..16], forms 16 weighted sums in C (pair weights */\
/* 20, -6, 3, -1) into temp[], then uses MMX to add ROUNDER, shift right by 5 and saturate */\
/* to bytes; the two 8-byte halves are handed to OP_3DNOW for dst and dst+8. */\
/* NOTE(review): OP_3DNOW is defined outside this block - presumably a store or average into dst; confirm. */\
/* dst advances by dstStride and src by srcStride after every row. */\
1458 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1459 int i;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/* 16-bit filter sums for one row, before rounding and shifting */\
1460 int16_t temp[16];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1461 /* quick HACK, XXX FIXME MUST be optimized */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1462 for(i=0; i<h; i++)\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1463 {\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/* General form: temp[k] = 20*(s[k]+s[k+1]) - 6*(s[k-1]+s[k+2]) + 3*(s[k-2]+s[k+3]) - (s[k-3]+s[k+4]). */\
/* Taps left of src[0] are mirrored back inside: src[-1]->src[0], src[-2]->src[1], src[-3]->src[2]. */\
1464 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1465 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1466 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1467 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1468 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1469 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1470 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1471 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1472 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1473 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1474 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1475 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1476 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/* From here on, taps right of src[16] are mirrored: src[17]->src[16], src[18]->src[15], src[19]->src[14]. */\
1477 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1478 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1479 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/* Operands: %0 = temp, %1 = dst, %2 = ROUNDER (memory). Each movq loads four 16-bit sums; */\
/* packuswb merges two such registers into 8 unsigned-saturated bytes. */\
1480 asm volatile(\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1481 "movq (%0), %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1482 "movq 8(%0), %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1483 "paddw %2, %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1484 "paddw %2, %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1485 "psraw $5, %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1486 "psraw $5, %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1487 "packuswb %%mm1, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* outputs 0..7 (from temp[0..7]) go to dst; mm1 is passed as the macro's third argument */\
1488 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1489 "movq 16(%0), %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1490 "movq 24(%0), %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1491 "paddw %2, %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1492 "paddw %2, %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1493 "psraw $5, %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1494 "psraw $5, %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1495 "packuswb %%mm1, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* outputs 8..15 (from temp[8..15]) go to dst+8 */\
1496 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1497 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
/* no output operands are declared; the "memory" clobber covers the accesses made through %0 and %1 */\
1498 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1499 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1500 dst+=dstStride;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1501 src+=srcStride;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1502 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1503 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1504 \
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
/* OPNAME##mpeg4_qpel8_h_lowpass_mmx2: horizontal lowpass producing 8 output pixels per row, */\
/* done entirely in one asm loop. Per row it reads src[0..8] (movq at src, movd at src+5), */\
/* builds the pair sums a, b, c, d in 16-bit lanes, forms 20a - 6b + 3c - d, adds ROUNDER, */\
/* shifts right by 5, packs to 8 unsigned-saturated bytes and hands them to OP_MMX2 for dst. */\
/* NOTE(review): OP_MMX2 is defined outside this block - presumably a store or average into dst; confirm. */\
/* The loop body runs before h is tested (decl/jnz at the bottom), so h is expected to be >= 1. */\
1505 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* passed to the asm as operand %5, but the instruction template below never references %5 */\
1506 uint64_t temp;\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1507 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Operands: %0 = src, %1 = dst, %2 = h (in memory), %3 = srcStride, %4 = dstStride, */\
/* %5 = temp, %6 = ROUNDER. mm7 is zeroed once and used to widen bytes to words. */\
/* The letters in the per-line comments name consecutive source bytes (A = src[0], ... I = src[8]). */\
1508 asm volatile(\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1509 "pxor %%mm7, %%mm7 \n\t"\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1510 "1: \n\t"\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1511 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1512 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1513 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1514 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1515 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* the pshufw shuffles of ABCD supply the left-edge taps by repeating A, B, C instead of reading before src[0] */\
1516 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1517 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1518 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1519 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1520 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1521 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1522 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1523 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1524 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1525 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1526 "paddw %%mm3, %%mm5 \n\t" /* b */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1527 "paddw %%mm2, %%mm6 \n\t" /* c */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1528 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1529 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1530 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1531 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1532 "paddw %%mm4, %%mm0 \n\t" /* a */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1533 "paddw %%mm1, %%mm5 \n\t" /* d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1534 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1535 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1536 "paddw %6, %%mm6 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1537 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1538 "psraw $5, %%mm0 \n\t"\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1539 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1540 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* second half (outputs 4..7): mm0 keeps the first four results; the pshufw shuffles of FGHI */\
/* supply the right-edge taps by repeating I, H, G instead of reading past src[8] */\
1541 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1542 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1543 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1544 "paddw %%mm5, %%mm1 \n\t" /* a */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1545 "paddw %%mm6, %%mm2 \n\t" /* b */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1546 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1547 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1548 "paddw %%mm6, %%mm3 \n\t" /* c */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1549 "paddw %%mm5, %%mm4 \n\t" /* d */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1550 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1551 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1552 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1553 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1554 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1555 "paddw %6, %%mm1 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1556 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1557 "psraw $5, %%mm3 \n\t"\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1558 "packuswb %%mm3, %%mm0 \n\t"\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1559 OP_MMX2(%%mm0, (%1), %%mm4, q)\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1560 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* step to the next row: src += srcStride, dst += dstStride, then count h down in memory */\
1561 "addl %3, %0 \n\t"\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1562 "addl %4, %1 \n\t"\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1563 "decl %2 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1564 " jnz 1b \n\t"\
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
/* src and dst are read-write register operands (constraints "a" and "c"); h is updated in memory */\
1565 : "+a"(src), "+c"(dst), "+m"(h)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1566 : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1567 : "memory"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1568 );\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1569 }\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1570 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* OPNAME##mpeg4_qpel8_h_lowpass_3dnow: horizontal lowpass producing 8 output pixels per row. */\
/* For each of the h rows it reads src[0..8], forms 8 weighted sums in C (pair weights */\
/* 20, -6, 3, -1) into temp[], then uses MMX to add ROUNDER, shift right by 5 and saturate */\
/* to bytes; the resulting 8 bytes are handed to OP_3DNOW for dst. */\
/* NOTE(review): OP_3DNOW is defined outside this block - presumably a store or average into dst; confirm. */\
/* dst advances by dstStride and src by srcStride after every row. */\
1571 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1572 int i;\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* 16-bit filter sums for one row, before rounding and shifting */\
1573 int16_t temp[8];\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1574 /* quick HACK, XXX FIXME MUST be optimized */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1575 for(i=0; i<h; i++)\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1576 {\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* General form: temp[k] = 20*(s[k]+s[k+1]) - 6*(s[k-1]+s[k+2]) + 3*(s[k-2]+s[k+3]) - (s[k-3]+s[k+4]). */\
/* Taps left of src[0] are mirrored back inside: src[-1]->src[0], src[-2]->src[1], src[-3]->src[2]. */\
1577 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1578 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1579 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1580 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1581 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* From here on, taps right of src[8] are mirrored: src[9]->src[8], src[10]->src[7], src[11]->src[6]. */\
1582 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1583 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1584 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Operands: %0 = temp, %1 = dst, %2 = ROUNDER (memory). The two movq loads cover temp[0..3] */\
/* and temp[4..7]; packuswb merges them into 8 unsigned-saturated bytes. */\
1585 asm volatile(\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1586 "movq (%0), %%mm0 \n\t"\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1587 "movq 8(%0), %%mm1 \n\t"\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1588 "paddw %2, %%mm0 \n\t"\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1589 "paddw %2, %%mm1 \n\t"\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1590 "psraw $5, %%mm0 \n\t"\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1591 "psraw $5, %%mm1 \n\t"\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1592 "packuswb %%mm1, %%mm0 \n\t"\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1593 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1594 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
/* no output operands are declared; the "memory" clobber covers the accesses made through %0 and %1 */\
1595 :"memory"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1596 );\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1597 dst+=dstStride;\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1598 src+=srcStride;\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1599 }\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1600 }
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1601
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1602 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1603 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1604 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1605 uint64_t temp[17*4];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1606 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1607 int count= 17;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1608 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1609 /*FIXME unroll */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1610 asm volatile(\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1611 "pxor %%mm7, %%mm7 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1612 "1: \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1613 "movq (%0), %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1614 "movq (%0), %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1615 "movq 8(%0), %%mm2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1616 "movq 8(%0), %%mm3 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1617 "punpcklbw %%mm7, %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1618 "punpckhbw %%mm7, %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1619 "punpcklbw %%mm7, %%mm2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1620 "punpckhbw %%mm7, %%mm3 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1621 "movq %%mm0, (%1) \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1622 "movq %%mm1, 17*8(%1) \n\t"\
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
1623 "movq %%mm2, 2*17*8(%1) \n\t"\
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
1624 "movq %%mm3, 3*17*8(%1) \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1625 "addl $8, %1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1626 "addl %3, %0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1627 "decl %2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1628 " jnz 1b \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1629 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
1630 : "r" (srcStride)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1631 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1632 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1633 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1634 temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1635 count=4;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1636 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1637 /*FIXME reorder for speed */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1638 asm volatile(\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1639 /*"pxor %%mm7, %%mm7 \n\t"*/\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1640 "1: \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1641 "movq (%0), %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1642 "movq 8(%0), %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1643 "movq 16(%0), %%mm2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1644 "movq 24(%0), %%mm3 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1645 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1646 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1647 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1648 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1649 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1650 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1651 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1652 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1653 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1654 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1655 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1656 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1657 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1658 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1659 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1660 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1661 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1662 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1663 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1664 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1665 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1666 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1667 "addl %4, %1 \n\t" \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1668 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1669 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1670 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1671 "addl $136, %0 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1672 "addl %6, %1 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1673 "decl %2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1674 " jnz 1b \n\t"\
958
9bb668034ecf slowdown / gcc 2.95.* bug workaround (this should be reversed as soon as gcc 2.95.* support is droped)
michaelni
parents: 954
diff changeset
1675 \
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
1676 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1677 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1678 :"memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1679 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1680 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1681 \
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
1682 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1683 uint64_t temp[9*4];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1684 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1685 int count= 9;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1686 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1687 /*FIXME unroll */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1688 asm volatile(\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1689 "pxor %%mm7, %%mm7 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1690 "1: \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1691 "movq (%0), %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1692 "movq (%0), %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1693 "punpcklbw %%mm7, %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1694 "punpckhbw %%mm7, %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1695 "movq %%mm0, (%1) \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1696 "movq %%mm1, 9*8(%1) \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1697 "addl $8, %1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1698 "addl %3, %0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1699 "decl %2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1700 " jnz 1b \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1701 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1702 : "r" (srcStride)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1703 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1704 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1705 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1706 temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1707 count=2;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1708 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1709 /*FIXME reorder for speed */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1710 asm volatile(\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1711 /*"pxor %%mm7, %%mm7 \n\t"*/\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1712 "1: \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1713 "movq (%0), %%mm0 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1714 "movq 8(%0), %%mm1 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1715 "movq 16(%0), %%mm2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1716 "movq 24(%0), %%mm3 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1717 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1718 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1719 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1720 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1721 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1722 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1723 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1724 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1725 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1726 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1727 "addl %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1728 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1729 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1730 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1731 "addl $72, %0 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1732 "addl %6, %1 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1733 "decl %2 \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1734 " jnz 1b \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1735 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1736 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1737 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1738 : "memory"\
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1739 );\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1740 }\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1741 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1742 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1743 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1744 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1745 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1746 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1747 uint64_t temp[8];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1748 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1749 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1750 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1751 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1752 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1753 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1754 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1755 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1756 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1757 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1758 uint64_t temp[8];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1759 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1760 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1761 OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1762 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1763 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1764 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1765 uint64_t temp[8];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1766 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1767 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1768 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1769 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1770 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1771 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1772 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1773 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1774 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1775 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1776 uint64_t temp[8];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1777 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1778 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1779 OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1780 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1781 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1782 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1783 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1784 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1785 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1786 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1787 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1788 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1789 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1790 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1791 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1792 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1793 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1794 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1795 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1796 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1797 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1798 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1799 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1800 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1801 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1802 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1803 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1804 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1805 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1806 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1807 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1808 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1809 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1810 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1811 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1812 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1813 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1814 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1815 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1816 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1817 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1818 uint64_t half[8 + 9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1819 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1820 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1821 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1822 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1823 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1824 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1825 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1826 uint64_t half[8 + 9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1827 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1828 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1829 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1830 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1831 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1832 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1833 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1834 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1835 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1836 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1837 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1838 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1839 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1840 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1841 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1842 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1843 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1844 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1845 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1846 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1847 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1848 uint64_t half[9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1849 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1850 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1851 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1852 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1853 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1854 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1855 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1856 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1857 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1858 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1859 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1860 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1861 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1862 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1863 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1864 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1865 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1866 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1867 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1868 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1869 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1870 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1871 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1872 OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1873 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1874 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1875 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1876 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1877 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1878 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1879 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1880 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1881 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1882 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1883 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1884 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1885 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1886 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1887 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1888 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1889 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1890 OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1891 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1892 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1893 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1894 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1895 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1896 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1897 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1898 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1899 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1900 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1901 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1902 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1903 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1904 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1905 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1906 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1907 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1908 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1909 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1910 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1911 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1912 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1913 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1914 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1915 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1916 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1917 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1918 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1919 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1920 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1921 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1922 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1923 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1924 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1925 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1926 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1927 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1928 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1929 uint64_t half[16*2 + 17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1930 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1931 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1932 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1933 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1934 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1935 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1936 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1937 uint64_t half[16*2 + 17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1938 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1939 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1940 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1941 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1942 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1943 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1944 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1945 uint64_t half[17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1946 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1947 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1948 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1949 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1950 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1951 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1952 uint64_t half[17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1953 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1954 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1955 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1956 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1957 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1958 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1959 uint64_t half[17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1960 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1961 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1962 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1963 }
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1964
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1965
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/* Store operation for the qpel macros: expands to the asm text
 * "mov<size> <a>, <b>". The temp argument is accepted so the signature
 * matches the AVG_*_OP macros, but it is not used in the expansion. */
1966 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Averaging store using the 3DNow! pavgusb instruction: expands to asm text
 * that loads <b> into <temp>, applies "pavgusb <temp>, <a>", and then moves
 * <a> to <b> with "mov<size>". */
1967 #define AVG_3DNOW_OP(a,b,temp, size) \
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1968 "mov" #size " " #b ", " #temp " \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1969 "pavgusb " #temp ", " #a " \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1970 "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Averaging store using the MMX2 pavgb instruction: same three-instruction
 * shape as AVG_3DNOW_OP (load <b> into <temp>, average into <a>, move <a>
 * to <b>), with "pavgb" in place of "pavgusb". */
1971 #define AVG_MMX2_OP(a,b,temp, size) \
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1972 "mov" #size " " #b ", " #temp " \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1973 "pavgb " #temp ", " #a " \n\t"\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1974 "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1975
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Instantiate the qpel function families from the QPEL_BASE / QPEL_OP macros.
 * Three variants are generated each time: put_, avg_ and put_no_rnd_.
 * The put_ and avg_ variants pass ff_pw_16 and the no_rnd variant passes
 * ff_pw_15 as the second argument (presumably the rounding constant - the
 * macro parameter list is outside this view, confirm there).
 * QPEL_BASE receives two store operations (the avg_ line passes the MMX2 and
 * 3DNow! flavours); QPEL_OP receives one store operation plus the function
 * name suffix (3dnow or mmx2). */
1976 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1977 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1978 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1979 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1980 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1981 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1982 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1983 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1984 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1985
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
/* Compiled out by the surrounding #if 0: a function that takes no arguments
 * and returns immediately. */
1986 #if 0
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
1987 static void just_return() { return; }
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
1988 #endif
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
1989
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/* Fills three function-pointer slots of the context variable `c`, which must
 * be in scope at the expansion site:
 *   c->put_<postfix1>        = put_<postfix2>
 *   c->put_no_rnd_<postfix1> = put_no_rnd_<postfix2>
 *   c->avg_<postfix1>        = avg_<postfix2>
 * The expansion already ends in a semicolon and is three separate
 * statements, so it is not safe as the body of an unbraced if/else. */
1990 #define SET_QPEL_FUNC(postfix1, postfix2) \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1991 c->put_ ## postfix1 = put_ ## postfix2;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1992 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1993 c->avg_ ## postfix1 = avg_ ## postfix2;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1994
1784
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/*
 * Computes a weighted squared-error score for adding a scaled basis function
 * to rem, without modifying rem (the asm only reads from it).
 *
 * For each of the 64 int16 coefficients the loop forms a scaled, rounded
 * basis term, adds rem, shifts right by 6, multiplies by weight and
 * accumulates the squares; the accumulated sum, after the right shifts
 * visible below, is returned.
 *
 * Asm operands: %0 = i (byte offset, then result), %1 = basis, %2 = rem,
 * %3 = weight, %4 = scale.
 *
 * scale must satisfy ABS(scale) < 256; this is only checked by assert().
 *
 * NOTE(review): the asm statement declares no clobbers for the MMX registers
 * it overwrites (mm0, mm1, mm5, mm6, mm7) and no emms is issued in this
 * function - presumably the caller is responsible; confirm.
 */
1995 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
1996 int i=0;
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
1997
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
1998 assert(ABS(scale) < 256);
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* Pre-shift scale so that pmulhw (which keeps the high 16 bits of each
 * product) followed by the rounded >>1 below yields the scaled basis term. */
1999 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2000
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2001 asm volatile(
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2002 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2003 "psrlw $15, %%mm6 \n\t" // 1w
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* mm7 is the running sum, cleared before the loop. */
2004 "pxor %%mm7, %%mm7 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* Load scale and replicate its low 16-bit word into all four words of mm5. */
2005 "movd %4, %%mm5 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2006 "punpcklwd %%mm5, %%mm5 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2007 "punpcklwd %%mm5, %%mm5 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* Loop body: handles 8 coefficients (16 bytes) per iteration. */
2008 "1: \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2009 "movq (%1, %0), %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2010 "movq 8(%1, %0), %%mm1 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2011 "pmulhw %%mm5, %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2012 "pmulhw %%mm5, %%mm1 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* Add 1 then arithmetic shift right by 1: rounded halving. */
2013 "paddw %%mm6, %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2014 "paddw %%mm6, %%mm1 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2015 "psraw $1, %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2016 "psraw $1, %%mm1 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* Add the current rem values (read only; nothing is stored back). */
2017 "paddw (%2, %0), %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2018 "paddw 8(%2, %0), %%mm1 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* NOTE(review): the literal 6 presumably equals RECON_SHIFT - confirm. */
2019 "psraw $6, %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2020 "psraw $6, %%mm1 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2021 "pmullw (%3, %0), %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2022 "pmullw 8(%3, %0), %%mm1 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* pmaddwd of a register with itself: squares each word and sums adjacent
 * pairs into 32-bit lanes. */
2023 "pmaddwd %%mm0, %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2024 "pmaddwd %%mm1, %%mm1 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2025 "paddd %%mm1, %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2026 "psrld $4, %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2027 "paddd %%mm0, %%mm7 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* Advance the byte offset; loop until 128 bytes (64 int16 values) are done. */
2028 "addl $16, %0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2029 "cmpl $128, %0 \n\t" //FIXME optimize & bench
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2030 " jb 1b \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* Horizontal reduction: add the high 32-bit lane of mm7 to the low lane,
 * shift right by 2, and move the low lane into operand %0 (i). */
2031 "movq %%mm7, %%mm6 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2032 "psrlq $32, %%mm7 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2033 "paddd %%mm6, %%mm7 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2034 "psrld $2, %%mm7 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2035 "movd %%mm7, %0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2036
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2037 : "+r" (i)
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2038 : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2039 );
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* i was reused as the asm output and now holds the score. */
2040 return i;
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2041 }
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2042
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/*
 * Adds a scaled basis function into rem in place, over all 64 coefficients.
 *
 * When ABS(scale) < 256 the MMX loop is used: each basis value is multiplied
 * by the pre-shifted scale (high 16 bits kept), rounded-halved, added to rem
 * and stored back. Otherwise the plain C loop in the else branch performs
 * the update with int arithmetic.
 *
 * Asm operands: %0 = i (byte offset), %1 = basis, %2 = rem, %3 = scale.
 *
 * NOTE(review): the asm writes to rem through (%2, %0) but declares no
 * "memory" clobber and no MMX register clobbers, and no emms is issued in
 * this function - presumably handled by the caller; confirm.
 */
2043 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2044 int i=0;
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2045
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2046 if(ABS(scale) < 256){
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* Pre-shift scale for use with pmulhw, which keeps the high 16 bits of
 * each product. */
2047 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2048 asm volatile(
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2049 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2050 "psrlw $15, %%mm6 \n\t" // 1w
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* Load scale and replicate its low 16-bit word into all four words of mm5. */
2051 "movd %3, %%mm5 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2052 "punpcklwd %%mm5, %%mm5 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2053 "punpcklwd %%mm5, %%mm5 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* Loop body: handles 8 coefficients (16 bytes) per iteration. */
2054 "1: \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2055 "movq (%1, %0), %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2056 "movq 8(%1, %0), %%mm1 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2057 "pmulhw %%mm5, %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2058 "pmulhw %%mm5, %%mm1 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* Add 1 then arithmetic shift right by 1: rounded halving. */
2059 "paddw %%mm6, %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2060 "paddw %%mm6, %%mm1 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2061 "psraw $1, %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2062 "psraw $1, %%mm1 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* Add the existing rem values and write the sums back to rem. */
2063 "paddw (%2, %0), %%mm0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2064 "paddw 8(%2, %0), %%mm1 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2065 "movq %%mm0, (%2, %0) \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2066 "movq %%mm1, 8(%2, %0) \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* Advance the byte offset; loop until 128 bytes (64 int16 values) are done. */
2067 "addl $16, %0 \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2068 "cmpl $128, %0 \n\t" //FIXME optimize & bench
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2069 " jb 1b \n\t"
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2070
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2071 : "+r" (i)
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2072 : "r"(basis), "r"(rem), "g"(scale)
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2073 );
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2074 }else{
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
/* Fallback for large scale values: uses the unshifted scale and rounds by
 * adding half of the final divisor before the right shift. */
2075 for(i=0; i<8*8; i++){
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2076 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2077 }
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2078 }
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2079 }
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2080
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2081 /* external functions, from idct_mmx.c */
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2082 void ff_mmx_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2083 void ff_mmxext_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2084
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2085 /* XXX: those functions should be suppressed ASAP when all IDCTs are
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2086 converted */
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
/* idct_put wrapper for the libmpeg2 MMX IDCT: calls ff_mmx_idct() on block,
 * then passes block to put_pixels_clamped_mmx() together with dest and
 * line_size. Both helpers are defined outside this view. */
2087 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2088 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2089 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2090 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2091 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
/* idct_add wrapper for the libmpeg2 MMX IDCT: calls ff_mmx_idct() on block,
 * then passes block to add_pixels_clamped_mmx() together with dest and
 * line_size. Both helpers are defined outside this view. */
2092 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2093 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2094 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2095 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2096 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
/* idct_put wrapper for the libmpeg2 MMXEXT IDCT: calls ff_mmxext_idct() on
 * block, then passes block to put_pixels_clamped_mmx() together with dest
 * and line_size. Both helpers are defined outside this view. */
2097 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2098 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2099 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2100 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2101 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
/* idct_add wrapper for the libmpeg2 MMXEXT IDCT: calls ff_mmxext_idct() on
 * block, then passes block to add_pixels_clamped_mmx() together with dest
 * and line_size. Both helpers are defined outside this view. */
2102 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2103 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2104 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2105 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2106 }
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2107
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2108 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2109 {
986e461dc072 Initial revision
glantau
parents:
diff changeset
2110 mm_flags = mm_support();
1115
74a46d77e061 * support FF_MM_FORCE
kabi
parents: 1092
diff changeset
2111
1122
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
2112 if (avctx->dsp_mask) {
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
2113 if (avctx->dsp_mask & FF_MM_FORCE)
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
2114 mm_flags |= (avctx->dsp_mask & 0xffff);
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
2115 else
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
2116 mm_flags &= ~(avctx->dsp_mask & 0xffff);
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
2117 }
1115
74a46d77e061 * support FF_MM_FORCE
kabi
parents: 1092
diff changeset
2118
631
47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents: 629
diff changeset
2119 #if 0
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2120 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2121 if (mm_flags & MM_MMX)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2122 av_log(avctx, AV_LOG_INFO, " mmx");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2123 if (mm_flags & MM_MMXEXT)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2124 av_log(avctx, AV_LOG_INFO, " mmxext");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2125 if (mm_flags & MM_3DNOW)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2126 av_log(avctx, AV_LOG_INFO, " 3dnow");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2127 if (mm_flags & MM_SSE)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2128 av_log(avctx, AV_LOG_INFO, " sse");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2129 if (mm_flags & MM_SSE2)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2130 av_log(avctx, AV_LOG_INFO, " sse2");
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2131 av_log(avctx, AV_LOG_INFO, "\n");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2132 #endif
986e461dc072 Initial revision
glantau
parents:
diff changeset
2133
986e461dc072 Initial revision
glantau
parents:
diff changeset
2134 if (mm_flags & MM_MMX) {
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2135 const int dct_algo = avctx->dct_algo;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2136 const int idct_algo= avctx->idct_algo;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2137
1232
e88d3b1fb2a1 more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>)
michaelni
parents: 1186
diff changeset
2138 #ifdef CONFIG_ENCODERS
1565
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
2139 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
2140 if(mm_flags & MM_SSE2){
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
2141 c->fdct = ff_fdct_sse2;
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
2142 }else if(mm_flags & MM_MMXEXT){
1565
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
2143 c->fdct = ff_fdct_mmx2;
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
2144 }else{
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
2145 c->fdct = ff_fdct_mmx;
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
2146 }
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
2147 }
1232
e88d3b1fb2a1 more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>)
michaelni
parents: 1186
diff changeset
2148 #endif //CONFIG_ENCODERS
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2149
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2150 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2151 c->idct_put= ff_simple_idct_put_mmx;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2152 c->idct_add= ff_simple_idct_add_mmx;
1324
7d328fd9d8a5 the return of the idct with 16bit output by ("Ivan Kalvachev" <ivan at cacad dot com>)
michaelni
parents: 1232
diff changeset
2153 c->idct = ff_simple_idct_mmx;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2154 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2155 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2156 if(mm_flags & MM_MMXEXT){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2157 c->idct_put= ff_libmpeg2mmx2_idct_put;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2158 c->idct_add= ff_libmpeg2mmx2_idct_add;
1324
7d328fd9d8a5 the return of the idct with 16bit output by ("Ivan Kalvachev" <ivan at cacad dot com>)
michaelni
parents: 1232
diff changeset
2159 c->idct = ff_mmxext_idct;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2160 }else{
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2161 c->idct_put= ff_libmpeg2mmx_idct_put;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2162 c->idct_add= ff_libmpeg2mmx_idct_add;
1324
7d328fd9d8a5 the return of the idct with 16bit output by ("Ivan Kalvachev" <ivan at cacad dot com>)
michaelni
parents: 1232
diff changeset
2163 c->idct = ff_mmx_idct;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2164 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2165 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2166 }
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2167
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2168 /* VP3 optimized DSP functions */
1972
8a556283601d hook up support for SSE2-optimized VP3 IDCT
melanson
parents: 1868
diff changeset
2169 if (mm_flags & MM_SSE2) {
8a556283601d hook up support for SSE2-optimized VP3 IDCT
melanson
parents: 1868
diff changeset
2170 c->vp3_dsp_init = vp3_dsp_init_sse2;
1977
89422281f6f6 reorganize and simplify the VP3 IDCT stuff
melanson
parents: 1972
diff changeset
2171 c->vp3_idct = vp3_idct_sse2;
1972
8a556283601d hook up support for SSE2-optimized VP3 IDCT
melanson
parents: 1868
diff changeset
2172 } else {
8a556283601d hook up support for SSE2-optimized VP3 IDCT
melanson
parents: 1868
diff changeset
2173 c->vp3_dsp_init = vp3_dsp_init_mmx;
1977
89422281f6f6 reorganize and simplify the VP3 IDCT stuff
melanson
parents: 1972
diff changeset
2174 c->vp3_idct = vp3_idct_mmx;
1972
8a556283601d hook up support for SSE2-optimized VP3 IDCT
melanson
parents: 1868
diff changeset
2175 }
1977
89422281f6f6 reorganize and simplify the VP3 IDCT stuff
melanson
parents: 1972
diff changeset
2176
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
2177 #ifdef CONFIG_ENCODERS
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2178 c->get_pixels = get_pixels_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2179 c->diff_pixels = diff_pixels_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
2180 #endif //CONFIG_ENCODERS
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2181 c->put_pixels_clamped = put_pixels_clamped_mmx;
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
2182 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2183 c->add_pixels_clamped = add_pixels_clamped_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2184 c->clear_blocks = clear_blocks_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
2185 #ifdef CONFIG_ENCODERS
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2186 c->pix_sum = pix_sum16_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
2187 #endif //CONFIG_ENCODERS
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
2188
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2189 c->put_pixels_tab[0][0] = put_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2190 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2191 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2192 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2193
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2194 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2195 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2196 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2197 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
2198
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2199 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2200 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2201 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2202 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
2203
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2204 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2205 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2206 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2207 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2208
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2209 c->put_pixels_tab[1][0] = put_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2210 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2211 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2212 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2213
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2214 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2215 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2216 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2217 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
2218
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2219 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2220 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2221 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2222 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
2223
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2224 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2225 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2226 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2227 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2228
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
2229 c->add_bytes= add_bytes_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
2230 #ifdef CONFIG_ENCODERS
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
2231 c->diff_bytes= diff_bytes_mmx;
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
2232
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
2233 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
2234 c->hadamard8_diff[1]= hadamard8_diff_mmx;
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
2235
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
2236 c->pix_norm1 = pix_norm1_mmx;
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
2237 c->sse[0] = sse16_mmx;
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
2238 c->vsad[4]= vsad_intra16_mmx;
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
2239
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
2240 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
2241 c->vsad[0] = vsad16_mmx;
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
2242 }
1784
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2243
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2244 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2245 c->try_8x8basis= try_8x8basis_mmx;
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2246 }
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2247 c->add_8x8basis= add_8x8basis_mmx;
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2248
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
2249 #endif //CONFIG_ENCODERS
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
2250
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
2251 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
1784
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
2252 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
2253
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2254 if (mm_flags & MM_MMXEXT) {
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2255 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2256 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
2257
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2258 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2259 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2260 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
2261
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2262 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2263 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
2264
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2265 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2266 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2267 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2268
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
2269 #ifdef CONFIG_ENCODERS
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
2270 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
2271 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
2272 c->vsad[4]= vsad_intra16_mmx2;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
2273 #endif //CONFIG_ENCODERS
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
2274
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2275 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2276 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2277 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2278 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2279 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2280 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2281 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
1772
8cd5257195c9 vsad16_mmx2 only applies if encoders are turned on
melanson
parents: 1765
diff changeset
2282 #ifdef CONFIG_ENCODERS
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
2283 c->vsad[0] = vsad16_mmx2;
1772
8cd5257195c9 vsad16_mmx2 only applies if encoders are turned on
melanson
parents: 1765
diff changeset
2284 #endif //CONFIG_ENCODERS
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2285 }
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2286
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2287 #if 1
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2288 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2289 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2290 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2291 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2292 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2293 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2294 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2295 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2296 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2297 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2298 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2299 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2300 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2301 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2302 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2303 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2304 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2305 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2306 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2307 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2308 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2309 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2310 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2311 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2312 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2313 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2314 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2315 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2316 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2317 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2318 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2319 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2320 #endif
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
2321
1686
68abbec33289 Here are juste two added #ifdef CONFIG_ENCODERS to allow
michael
parents: 1648
diff changeset
2322 #ifdef CONFIG_ENCODERS
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
2323 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
1686
68abbec33289 Here are juste two added #ifdef CONFIG_ENCODERS to allow
michael
parents: 1648
diff changeset
2324 #endif //CONFIG_ENCODERS
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2325 } else if (mm_flags & MM_3DNOW) {
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2326 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2327 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
2328
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2329 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2330 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2331 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
2332
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2333 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2334 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2335
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2336 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2337 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2338 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2339
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2340 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2341 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2342 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2343 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2344 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2345 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2346 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2347 }
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2348
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2349 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2350 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2351 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2352 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2353 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2354 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2355 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2356 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2357 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2358 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2359 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2360 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2361 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2362 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2363 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2364 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2365 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2366 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2367 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2368 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2369 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2370 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2371 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2372 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2373 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2374 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2375 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2376 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2377 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2378 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2379 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2380 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2381 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
2382 }
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2383
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
2384 #ifdef CONFIG_ENCODERS
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2385 dsputil_init_pix_mmx(c, avctx);
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
2386 #endif //CONFIG_ENCODERS
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2387 #if 0
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2388 // for speed testing
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2389 get_pixels = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2390 put_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2391 add_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2392
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2393 pix_abs16x16 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2394 pix_abs16x16_x2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2395 pix_abs16x16_y2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2396 pix_abs16x16_xy2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2397
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2398 put_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2399 put_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2400 put_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2401 put_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2402
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2403 put_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2404 put_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2405 put_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2406 put_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2407
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2408 avg_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2409 avg_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2410 avg_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2411 avg_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2412
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2413 avg_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2414 avg_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2415 avg_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2416 avg_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2417
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2418 //av_fdct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2419 //ff_idct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2420 #endif
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2421 }