annotate i386/h264dsp_mmx.c @ 8423:b55ec18fe5e9 libavcodec

Use ARCH_X86_32 instead of !ARCH_X86_64, it is more straightforward.
author diego
date Mon, 22 Dec 2008 00:03:30 +0000
parents de2509cf3c44
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1 /*
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
3 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3645
diff changeset
4 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3645
diff changeset
5 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3645
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3645
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
10 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3645
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
14 * Lesser General Public License for more details.
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
15 *
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3645
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 3029
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
19 */
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
20
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5809
diff changeset
21 #include "dsputil_mmx.h"
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
22
7327
483421b11d98 Fix h264_loop_filter_strength_mmx2() so it works with PAFF.
michael
parents: 6755
diff changeset
23 DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
483421b11d98 Fix h264_loop_filter_strength_mmx2() so it works with PAFF.
michael
parents: 6755
diff changeset
24 DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;
483421b11d98 Fix h264_loop_filter_strength_mmx2() so it works with PAFF.
michael
parents: 6755
diff changeset
25
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
26 /***********************************/
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
27 /* IDCT */
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
28
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
29 #define SUMSUB_BADC( a, b, c, d ) \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
30 "paddw "#b", "#a" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
31 "paddw "#d", "#c" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
32 "paddw "#b", "#b" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
33 "paddw "#d", "#d" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
34 "psubw "#a", "#b" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
35 "psubw "#c", "#d" \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
36
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
37 #define SUMSUBD2_AB( a, b, t ) \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
38 "movq "#b", "#t" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
39 "psraw $1 , "#b" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
40 "paddw "#a", "#b" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
41 "psraw $1 , "#a" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
42 "psubw "#t", "#a" \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
43
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
44 #define IDCT4_1D( s02, s13, d02, d13, t ) \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
45 SUMSUB_BA ( s02, d02 )\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
46 SUMSUBD2_AB( s13, d13, t )\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
47 SUMSUB_BADC( d13, s02, s13, d02 )
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
48
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
49 #define STORE_DIFF_4P( p, t, z ) \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
50 "psraw $6, "#p" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
51 "movd (%0), "#t" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
52 "punpcklbw "#z", "#t" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
53 "paddsw "#t", "#p" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
54 "packuswb "#z", "#p" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
55 "movd "#p", (%0) \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
56
3173
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3165
diff changeset
57 static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
58 {
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
59 /* Load dct coeffs */
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
60 __asm__ volatile(
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
61 "movq (%0), %%mm0 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
62 "movq 8(%0), %%mm1 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
63 "movq 16(%0), %%mm2 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
64 "movq 24(%0), %%mm3 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
65 :: "r"(block) );
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
66
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
67 __asm__ volatile(
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
68 /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
69 IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
70
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
71 "movq %0, %%mm6 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
72 /* in: 1,4,0,2 out: 1,2,3,0 */
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
73 TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
74
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
75 "paddw %%mm6, %%mm3 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
76
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
77 /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
78 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
79
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
80 "pxor %%mm7, %%mm7 \n\t"
6329
5969caa9190d clean up an ugliness introduced in r11826. this syntax will require fewer changes when adding future sse2 code.
lorenm
parents: 6321
diff changeset
81 :: "m"(ff_pw_32));
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
82
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
83 __asm__ volatile(
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
84 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
85 "add %1, %0 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
86 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
87 "add %1, %0 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
88 STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
89 "add %1, %0 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
90 STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
91 : "+r"(dst)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
92 : "r" ((x86_reg)stride)
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
93 );
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
94 }
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
95
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
96 static inline void h264_idct8_1d(int16_t *block)
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
97 {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
98 __asm__ volatile(
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
99 "movq 112(%0), %%mm7 \n\t"
6319
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
100 "movq 80(%0), %%mm0 \n\t"
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
101 "movq 48(%0), %%mm3 \n\t"
6319
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
102 "movq 16(%0), %%mm5 \n\t"
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
103
6319
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
104 "movq %%mm0, %%mm4 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
105 "movq %%mm5, %%mm1 \n\t"
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
106 "psraw $1, %%mm4 \n\t"
6319
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
107 "psraw $1, %%mm1 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
108 "paddw %%mm0, %%mm4 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
109 "paddw %%mm5, %%mm1 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
110 "paddw %%mm7, %%mm4 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
111 "paddw %%mm0, %%mm1 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
112 "psubw %%mm5, %%mm4 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
113 "paddw %%mm3, %%mm1 \n\t"
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
114
6319
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
115 "psubw %%mm3, %%mm5 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
116 "psubw %%mm3, %%mm0 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
117 "paddw %%mm7, %%mm5 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
118 "psubw %%mm7, %%mm0 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
119 "psraw $1, %%mm3 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
120 "psraw $1, %%mm7 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
121 "psubw %%mm3, %%mm5 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
122 "psubw %%mm7, %%mm0 \n\t"
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
123
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
124 "movq %%mm4, %%mm3 \n\t"
6319
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
125 "movq %%mm1, %%mm7 \n\t"
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
126 "psraw $2, %%mm1 \n\t"
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
127 "psraw $2, %%mm3 \n\t"
6319
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
128 "paddw %%mm5, %%mm3 \n\t"
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
129 "psraw $2, %%mm5 \n\t"
6319
4089a1ae6558 remove some movq in ff_h264_idct8_add_mmx. 225->217 cycles on core2.
lorenm
parents: 6135
diff changeset
130 "paddw %%mm0, %%mm1 \n\t"
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
131 "psraw $2, %%mm0 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
132 "psubw %%mm4, %%mm5 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
133 "psubw %%mm0, %%mm7 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
134
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
135 "movq 32(%0), %%mm2 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
136 "movq 96(%0), %%mm6 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
137 "movq %%mm2, %%mm4 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
138 "movq %%mm6, %%mm0 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
139 "psraw $1, %%mm4 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
140 "psraw $1, %%mm6 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
141 "psubw %%mm0, %%mm4 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
142 "paddw %%mm2, %%mm6 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
143
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
144 "movq (%0), %%mm2 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
145 "movq 64(%0), %%mm0 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
146 SUMSUB_BA( %%mm0, %%mm2 )
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
147 SUMSUB_BA( %%mm6, %%mm0 )
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
148 SUMSUB_BA( %%mm4, %%mm2 )
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
149 SUMSUB_BA( %%mm7, %%mm6 )
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
150 SUMSUB_BA( %%mm5, %%mm4 )
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
151 SUMSUB_BA( %%mm3, %%mm2 )
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
152 SUMSUB_BA( %%mm1, %%mm0 )
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
153 :: "r"(block)
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
154 );
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
155 }
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
156
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
157 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
158 {
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
159 int i;
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
160 int16_t __attribute__ ((aligned(8))) b2[64];
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
161
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
162 block[0] += 32;
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
163
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
164 for(i=0; i<2; i++){
4137
6e5dcbdbfeba ensure alignment (no speed change)
michael
parents: 4136
diff changeset
165 DECLARE_ALIGNED_8(uint64_t, tmp);
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
166
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
167 h264_idct8_1d(block+4*i);
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
168
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
169 __asm__ volatile(
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
170 "movq %%mm7, %0 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
171 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
172 "movq %%mm0, 8(%1) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
173 "movq %%mm6, 24(%1) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
174 "movq %%mm7, 40(%1) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
175 "movq %%mm4, 56(%1) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
176 "movq %0, %%mm7 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
177 TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
178 "movq %%mm7, (%1) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
179 "movq %%mm1, 16(%1) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
180 "movq %%mm0, 32(%1) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
181 "movq %%mm3, 48(%1) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
182 : "=m"(tmp)
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
183 : "r"(b2+32*i)
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
184 : "memory"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
185 );
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
186 }
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
187
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
188 for(i=0; i<2; i++){
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
189 h264_idct8_1d(b2+4*i);
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
190
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
191 __asm__ volatile(
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
192 "psraw $6, %%mm7 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
193 "psraw $6, %%mm6 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
194 "psraw $6, %%mm5 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
195 "psraw $6, %%mm4 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
196 "psraw $6, %%mm3 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
197 "psraw $6, %%mm2 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
198 "psraw $6, %%mm1 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
199 "psraw $6, %%mm0 \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
200
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
201 "movq %%mm7, (%0) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
202 "movq %%mm5, 16(%0) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
203 "movq %%mm3, 32(%0) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
204 "movq %%mm1, 48(%0) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
205 "movq %%mm0, 64(%0) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
206 "movq %%mm2, 80(%0) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
207 "movq %%mm4, 96(%0) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
208 "movq %%mm6, 112(%0) \n\t"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
209 :: "r"(b2+4*i)
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
210 : "memory"
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
211 );
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
212 }
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
213
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
214 add_pixels_clamped_mmx(b2, dst, stride);
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
215 }
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
216
6320
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
217 #define STORE_DIFF_8P( p, d, t, z )\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
218 "movq "#d", "#t" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
219 "psraw $6, "#p" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
220 "punpcklbw "#z", "#t" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
221 "paddsw "#t", "#p" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
222 "packuswb "#p", "#p" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
223 "movq "#p", "#d" \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
224
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
225 #define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
226 "movdqa "#c", "#a" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
227 "movdqa "#g", "#e" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
228 "psraw $1, "#c" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
229 "psraw $1, "#g" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
230 "psubw "#e", "#c" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
231 "paddw "#a", "#g" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
232 "movdqa "#b", "#e" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
233 "psraw $1, "#e" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
234 "paddw "#b", "#e" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
235 "paddw "#d", "#e" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
236 "paddw "#f", "#e" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
237 "movdqa "#f", "#a" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
238 "psraw $1, "#a" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
239 "paddw "#f", "#a" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
240 "paddw "#h", "#a" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
241 "psubw "#b", "#a" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
242 "psubw "#d", "#b" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
243 "psubw "#d", "#f" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
244 "paddw "#h", "#b" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
245 "psubw "#h", "#f" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
246 "psraw $1, "#d" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
247 "psraw $1, "#h" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
248 "psubw "#d", "#b" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
249 "psubw "#h", "#f" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
250 "movdqa "#e", "#d" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
251 "movdqa "#a", "#h" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
252 "psraw $2, "#d" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
253 "psraw $2, "#h" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
254 "paddw "#f", "#d" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
255 "paddw "#b", "#h" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
256 "psraw $2, "#f" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
257 "psraw $2, "#b" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
258 "psubw "#f", "#e" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
259 "psubw "#a", "#b" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
260 "movdqa 0x00(%1), "#a" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
261 "movdqa 0x40(%1), "#f" \n"\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
262 SUMSUB_BA(f, a)\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
263 SUMSUB_BA(g, f)\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
264 SUMSUB_BA(c, a)\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
265 SUMSUB_BA(e, g)\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
266 SUMSUB_BA(b, c)\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
267 SUMSUB_BA(h, a)\
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
268 SUMSUB_BA(d, f)
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
269
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
270 static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
271 {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
272 __asm__ volatile(
6320
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
273 "movdqa 0x10(%1), %%xmm1 \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
274 "movdqa 0x20(%1), %%xmm2 \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
275 "movdqa 0x30(%1), %%xmm3 \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
276 "movdqa 0x50(%1), %%xmm5 \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
277 "movdqa 0x60(%1), %%xmm6 \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
278 "movdqa 0x70(%1), %%xmm7 \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
279 H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
280 TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
281 "paddw %4, %%xmm4 \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
282 "movdqa %%xmm4, 0x00(%1) \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
283 "movdqa %%xmm2, 0x40(%1) \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
284 H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
285 "movdqa %%xmm6, 0x60(%1) \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
286 "movdqa %%xmm7, 0x70(%1) \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
287 "pxor %%xmm7, %%xmm7 \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
288 STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7)
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
289 STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7)
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
290 STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
291 STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7)
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
292 "lea (%0,%2,4), %0 \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
293 STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7)
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
294 STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7)
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
295 "movdqa 0x60(%1), %%xmm0 \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
296 "movdqa 0x70(%1), %%xmm1 \n"
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
297 STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
298 STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7)
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
299 :"+r"(dst)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
300 :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
6320
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
301 );
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
302 }
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6319
diff changeset
303
3173
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3165
diff changeset
304 static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
3105
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
305 {
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
306 int dc = (block[0] + 32) >> 6;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
307 __asm__ volatile(
3105
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
308 "movd %0, %%mm0 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
309 "pshufw $0, %%mm0, %%mm0 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
310 "pxor %%mm1, %%mm1 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
311 "psubw %%mm0, %%mm1 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
312 "packuswb %%mm0, %%mm0 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
313 "packuswb %%mm1, %%mm1 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
314 ::"r"(dc)
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
315 );
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
316 __asm__ volatile(
3105
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
317 "movd %0, %%mm2 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
318 "movd %1, %%mm3 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
319 "movd %2, %%mm4 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
320 "movd %3, %%mm5 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
321 "paddusb %%mm0, %%mm2 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
322 "paddusb %%mm0, %%mm3 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
323 "paddusb %%mm0, %%mm4 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
324 "paddusb %%mm0, %%mm5 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
325 "psubusb %%mm1, %%mm2 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
326 "psubusb %%mm1, %%mm3 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
327 "psubusb %%mm1, %%mm4 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
328 "psubusb %%mm1, %%mm5 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
329 "movd %%mm2, %0 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
330 "movd %%mm3, %1 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
331 "movd %%mm4, %2 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
332 "movd %%mm5, %3 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
333 :"+m"(*(uint32_t*)(dst+0*stride)),
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
334 "+m"(*(uint32_t*)(dst+1*stride)),
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
335 "+m"(*(uint32_t*)(dst+2*stride)),
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
336 "+m"(*(uint32_t*)(dst+3*stride))
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
337 );
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
338 }
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
339
3173
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3165
diff changeset
340 static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
3105
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
341 {
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
342 int dc = (block[0] + 32) >> 6;
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
343 int y;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
344 __asm__ volatile(
3105
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
345 "movd %0, %%mm0 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
346 "pshufw $0, %%mm0, %%mm0 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
347 "pxor %%mm1, %%mm1 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
348 "psubw %%mm0, %%mm1 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
349 "packuswb %%mm0, %%mm0 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
350 "packuswb %%mm1, %%mm1 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
351 ::"r"(dc)
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
352 );
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
353 for(y=2; y--; dst += 4*stride){
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
354 __asm__ volatile(
3105
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
355 "movq %0, %%mm2 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
356 "movq %1, %%mm3 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
357 "movq %2, %%mm4 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
358 "movq %3, %%mm5 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
359 "paddusb %%mm0, %%mm2 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
360 "paddusb %%mm0, %%mm3 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
361 "paddusb %%mm0, %%mm4 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
362 "paddusb %%mm0, %%mm5 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
363 "psubusb %%mm1, %%mm2 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
364 "psubusb %%mm1, %%mm3 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
365 "psubusb %%mm1, %%mm4 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
366 "psubusb %%mm1, %%mm5 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
367 "movq %%mm2, %0 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
368 "movq %%mm3, %1 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
369 "movq %%mm4, %2 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
370 "movq %%mm5, %3 \n\t"
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
371 :"+m"(*(uint64_t*)(dst+0*stride)),
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
372 "+m"(*(uint64_t*)(dst+1*stride)),
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
373 "+m"(*(uint64_t*)(dst+2*stride)),
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
374 "+m"(*(uint64_t*)(dst+3*stride))
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
375 );
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
376 }
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
377 }
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3102
diff changeset
378
8375
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
379 //FIXME this table is a duplicate of the one in h264data.h, and will be removed once the tables from h264 have been split
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
380 static const uint8_t scan8[16 + 2*4]={
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
381 4+1*8, 5+1*8, 4+2*8, 5+2*8,
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
382 6+1*8, 7+1*8, 6+2*8, 7+2*8,
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
383 4+3*8, 5+3*8, 4+4*8, 5+4*8,
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
384 6+3*8, 7+3*8, 6+4*8, 7+4*8,
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
385 1+1*8, 2+1*8,
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
386 1+2*8, 2+2*8,
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
387 1+4*8, 2+4*8,
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
388 1+5*8, 2+5*8,
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
389 };
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
390
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
391 static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
392 int i;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
393 for(i=0; i<16; i++){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
394 if(nnzc[ scan8[i] ])
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
395 ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
396 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
397 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
398
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
399 static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
400 int i;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
401 for(i=0; i<16; i+=4){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
402 if(nnzc[ scan8[i] ])
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
403 ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride);
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
404 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
405 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
406
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
407
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
408 static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
409 int i;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
410 for(i=0; i<16; i++){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
411 int nnz = nnzc[ scan8[i] ];
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
412 if(nnz){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
413 if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
414 else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
415 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
416 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
417 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
418
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
419 static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
420 int i;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
421 for(i=0; i<16; i++){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
422 if(nnzc[ scan8[i] ] || block[i*16])
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
423 ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
424 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
425 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
426
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
427 static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
428 int i;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
429 for(i=0; i<16; i++){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
430 if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
431 else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
432 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
433 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
434
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
435 static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
436 int i;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
437 for(i=0; i<16; i+=4){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
438 int nnz = nnzc[ scan8[i] ];
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
439 if(nnz){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
440 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
441 else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride);
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
442 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
443 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
444 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
445
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
446 static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
447 int i;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
448 for(i=0; i<16; i+=4){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
449 int nnz = nnzc[ scan8[i] ];
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
450 if(nnz){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
451 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
452 else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride);
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
453 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
454 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
455 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
456
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
457 static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
458 int i;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
459 for(i=16; i<16+8; i++){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
460 if(nnzc[ scan8[i] ] || block[i*16])
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
461 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
462 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
463 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
464
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
465 static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
466 int i;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
467 for(i=16; i<16+8; i++){
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
468 if(nnzc[ scan8[i] ])
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
469 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
470 else if(block[i*16])
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
471 ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
472 }
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8031
diff changeset
473 }
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
474
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
475 /***********************************/
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
476 /* deblocking */
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
477
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
478 // out: o = max(|x-y|-a, 0) per byte, i.e. nonzero iff |x-y|>a
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
479 // clobbers: t
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
480 #define DIFF_GT_MMX(x,y,a,o,t)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
481 "movq "#y", "#t" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
482 "movq "#x", "#o" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
483 "psubusb "#x", "#t" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
484 "psubusb "#y", "#o" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
485 "por "#t", "#o" \n\t"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
486 "psubusb "#a", "#o" \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
487
4135
bbf0caa655f0 2 instructions less (same speed)
michael
parents: 4134
diff changeset
488 // out: o = byte mask of |x-y|<=a (0xFF where true, 0x00 where |x-y|>a)
bbf0caa655f0 2 instructions less (same speed)
michael
parents: 4134
diff changeset
489 // clobbers: t
bbf0caa655f0 2 instructions less (same speed)
michael
parents: 4134
diff changeset
490 #define DIFF_GT2_MMX(x,y,a,o,t)\
bbf0caa655f0 2 instructions less (same speed)
michael
parents: 4134
diff changeset
491 "movq "#y", "#t" \n\t"\
bbf0caa655f0 2 instructions less (same speed)
michael
parents: 4134
diff changeset
492 "movq "#x", "#o" \n\t"\
bbf0caa655f0 2 instructions less (same speed)
michael
parents: 4134
diff changeset
493 "psubusb "#x", "#t" \n\t"\
bbf0caa655f0 2 instructions less (same speed)
michael
parents: 4134
diff changeset
494 "psubusb "#y", "#o" \n\t"\
bbf0caa655f0 2 instructions less (same speed)
michael
parents: 4134
diff changeset
495 "psubusb "#a", "#t" \n\t"\
bbf0caa655f0 2 instructions less (same speed)
michael
parents: 4134
diff changeset
496 "psubusb "#a", "#o" \n\t"\
bbf0caa655f0 2 instructions less (same speed)
michael
parents: 4134
diff changeset
497 "pcmpeqb "#t", "#o" \n\t"\
bbf0caa655f0 2 instructions less (same speed)
michael
parents: 4134
diff changeset
498
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
/*
 * H264_DEBLOCK_MASK: build the per-byte "filter this pixel" mask.
 *
 * in:       mm0=p1 mm1=p0 mm2=q0 mm3=q1
 *           alpha1, beta1 = asm operands holding alpha-1 and beta-1
 * out:      mm5=beta-1, mm7=mask
 * clobbers: mm4,mm6
 *
 * Word 0 of each threshold is copied to all four words (pshufw $0) and then
 * packed to bytes, so every byte lane carries the same threshold.  The three
 * DIFF_GT_MMX results are OR-ed into mm7 and finally compared against zero,
 * i.e. mm7 is set in the lanes where none of the three tests fired.
 * NOTE(review): DIFF_GT_MMX is defined outside this view; its exact result
 * encoding is taken from the inline comments below - confirm.
 */
#define H264_DEBLOCK_MASK(alpha1, beta1) \
    "pshufw $0, "#alpha1", %%mm4 \n\t"\
    "pshufw $0, "#beta1 ", %%mm5 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm5, %%mm5 \n\t"\
    DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
    DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
    "por %%mm4, %%mm7 \n\t"\
    DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
    "por %%mm4, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pcmpeqb %%mm6, %%mm7 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
514
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
/*
 * H264_DEBLOCK_P0_Q0: normal-strength filter of the two pixels next to the edge.
 *
 * in:       mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
 *           pb_01 = asm operand holding a vector of 0x01 bytes
 *           pb_3f = not referenced by the macro body (kept for the callers)
 * out:      mm1=p0' mm2=q0'
 * clobbers: mm0,3-6
 *
 * The delta is built with unsigned byte averages around a bias instead of
 * signed 16-bit arithmetic; the positive and negative parts are separated by
 * saturating subtraction against ff_pb_A1, limited to mm7 with pminub and
 * then applied to p0/q0 with saturating add/sub.
 * ff_pb_3 and ff_pb_A1 are reached through MANGLE(), which is defined
 * outside this view.
 */
#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
    "movq %%mm1 , %%mm5 \n\t"\
    "pxor %%mm2 , %%mm5 \n\t" /* p0^q0 */\
    "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1 */\
    "pcmpeqb %%mm4 , %%mm4 \n\t"\
    "pxor %%mm4 , %%mm3 \n\t"\
    "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1 */\
    "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /* (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 */\
    "pxor %%mm1 , %%mm4 \n\t"\
    "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1 */\
    "pavgb %%mm5 , %%mm3 \n\t"\
    "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33 */\
    "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
    "psubusb %%mm3 , %%mm6 \n\t"\
    "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
    "pminub %%mm7 , %%mm6 \n\t"\
    "pminub %%mm7 , %%mm3 \n\t"\
    "psubusb %%mm6 , %%mm1 \n\t"\
    "psubusb %%mm3 , %%mm2 \n\t"\
    "paddusb %%mm3 , %%mm1 \n\t"\
    "paddusb %%mm6 , %%mm2 \n\t"
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
539
5947
37a03989871b use ff_ prefix for extern vars
aurel
parents: 5946
diff changeset
/*
 * H264_DEBLOCK_Q1: filter the second pixel away from the edge (p1 or q1).
 *
 * in:       mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone
 *           p1     = register holding the pixel being filtered
 *           q2     = register holding the pixel one further from the edge
 *           q2addr = memory operand string for that same outer pixel
 *           q1addr = memory operand string the result is stored to
 *           tc0    = register holding the clipping range
 *           tmp    = scratch register
 * out:      (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
 * clobbers: q2, tmp, tc0
 *
 * Operand %8 is hard-coded here, so every asm statement using this macro
 * must pass the vector of 0x01 bytes as its operand number 8.
 */
#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
    "movq %%mm1, "#tmp" \n\t"\
    "pavgb %%mm2, "#tmp" \n\t"\
    "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\
    "pxor "q2addr", "#tmp" \n\t"\
    "pand %8, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\
    "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
    "movq "#p1", "#tmp" \n\t"\
    "psubusb "#tc0", "#tmp" \n\t" /* lower clip bound */\
    "paddusb "#p1", "#tc0" \n\t" /* upper clip bound */\
    "pmaxub "#tmp", "#q2" \n\t"\
    "pminub "#tc0", "#q2" \n\t"\
    "movq "#q2", "q1addr" \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
556
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
557 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
558 {
4137
6e5dcbdbfeba ensure alignment (no speed change)
michael
parents: 4136
diff changeset
559 DECLARE_ALIGNED_8(uint64_t, tmp0[2]);
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
560
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
561 __asm__ volatile(
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
562 "movq (%1,%3), %%mm0 \n\t" //p1
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
563 "movq (%1,%3,2), %%mm1 \n\t" //p0
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
564 "movq (%2), %%mm2 \n\t" //q0
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
565 "movq (%2,%3), %%mm3 \n\t" //q1
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
566 H264_DEBLOCK_MASK(%6, %7)
4133
560ea2d5524e move luma tc0 related init into asm
michael
parents: 4131
diff changeset
567
560ea2d5524e move luma tc0 related init into asm
michael
parents: 4131
diff changeset
568 "movd %5, %%mm4 \n\t"
560ea2d5524e move luma tc0 related init into asm
michael
parents: 4131
diff changeset
569 "punpcklbw %%mm4, %%mm4 \n\t"
560ea2d5524e move luma tc0 related init into asm
michael
parents: 4131
diff changeset
570 "punpcklwd %%mm4, %%mm4 \n\t"
560ea2d5524e move luma tc0 related init into asm
michael
parents: 4131
diff changeset
571 "pcmpeqb %%mm3, %%mm3 \n\t"
560ea2d5524e move luma tc0 related init into asm
michael
parents: 4131
diff changeset
572 "movq %%mm4, %%mm6 \n\t"
560ea2d5524e move luma tc0 related init into asm
michael
parents: 4131
diff changeset
573 "pcmpgtb %%mm3, %%mm4 \n\t"
560ea2d5524e move luma tc0 related init into asm
michael
parents: 4131
diff changeset
574 "movq %%mm6, 8+%0 \n\t"
560ea2d5524e move luma tc0 related init into asm
michael
parents: 4131
diff changeset
575 "pand %%mm4, %%mm7 \n\t"
560ea2d5524e move luma tc0 related init into asm
michael
parents: 4131
diff changeset
576 "movq %%mm7, %0 \n\t"
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
577
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
578 /* filter p1 */
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
579 "movq (%1), %%mm3 \n\t" //p2
4135
bbf0caa655f0 2 instructions less (same speed)
michael
parents: 4134
diff changeset
580 DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
581 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta
4136
05ffda007f80 merging mov & and (no speedchange)
michael
parents: 4135
diff changeset
582 "pand 8+%0, %%mm7 \n\t" // mask & tc0
05ffda007f80 merging mov & and (no speedchange)
michael
parents: 4135
diff changeset
583 "movq %%mm7, %%mm4 \n\t"
4131
1a8e384d0463 2 instructions less in h264_loop_filter_luma_mmx2()
michael
parents: 4130
diff changeset
584 "psubb %%mm6, %%mm7 \n\t"
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
585 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
586 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
587
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
588 /* filter q1 */
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
589 "movq (%2,%3,2), %%mm4 \n\t" //q2
4135
bbf0caa655f0 2 instructions less (same speed)
michael
parents: 4134
diff changeset
590 DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
591 "pand %0, %%mm6 \n\t"
4134
3d2887b288f4 comment about failed optimization
michael
parents: 4133
diff changeset
592 "movq 8+%0, %%mm5 \n\t" // can be merged with the and below but is slower then
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
593 "pand %%mm6, %%mm5 \n\t"
4131
1a8e384d0463 2 instructions less in h264_loop_filter_luma_mmx2()
michael
parents: 4130
diff changeset
594 "psubb %%mm6, %%mm7 \n\t"
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
595 "movq (%2,%3), %%mm3 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
596 H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
597
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
598 /* filter p0, q0 */
4133
560ea2d5524e move luma tc0 related init into asm
michael
parents: 4131
diff changeset
599 H264_DEBLOCK_P0_Q0(%8, unused)
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
600 "movq %%mm1, (%1,%3,2) \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
601 "movq %%mm2, (%2) \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
602
4133
560ea2d5524e move luma tc0 related init into asm
michael
parents: 4131
diff changeset
603 : "=m"(*tmp0)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
604 : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
4133
560ea2d5524e move luma tc0 related init into asm
michael
parents: 4131
diff changeset
605 "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
5947
37a03989871b use ff_ prefix for extern vars
aurel
parents: 5946
diff changeset
606 "m"(ff_bone)
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
607 );
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
608 }
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
609
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
/**
 * Filter a 16 pixel wide luma edge lying between two rows.
 *
 * The edge is handled as two halves of 8 pixels; half k uses tc0[2*k] and
 * tc0[2*k+1] and is skipped when both of those values are negative.
 */
static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    int half;

    for (half = 0; half < 2; half++) {
        int8_t *tc = tc0 + 2*half;

        /* the AND of two int8 values is negative only if both are negative */
        if ((tc[0] & tc[1]) < 0)
            continue;
        h264_loop_filter_luma_mmx2(pix + 8*half, stride, alpha-1, beta-1, tc);
    }
}
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
617 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
618 {
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
619 //FIXME: could cut some load/stores by merging transpose with filter
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
620 // also, it only needs to transpose 6x8
4137
6e5dcbdbfeba ensure alignment (no speed change)
michael
parents: 4136
diff changeset
621 DECLARE_ALIGNED_8(uint8_t, trans[8*8]);
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
622 int i;
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
623 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
624 if((tc0[0] & tc0[1]) < 0)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
625 continue;
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
626 transpose4x4(trans, pix-4, 8, stride);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
627 transpose4x4(trans +4*8, pix, 8, stride);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
628 transpose4x4(trans+4, pix-4+4*stride, 8, stride);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
629 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
630 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
631 transpose4x4(pix-2, trans +2*8, stride, 8);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
632 transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
633 }
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
634 }
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
635
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
636 static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
637 {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
638 __asm__ volatile(
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
639 "movq (%0), %%mm0 \n\t" //p1
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
640 "movq (%0,%2), %%mm1 \n\t" //p0
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
641 "movq (%1), %%mm2 \n\t" //q0
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
642 "movq (%1,%2), %%mm3 \n\t" //q1
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
643 H264_DEBLOCK_MASK(%4, %5)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
644 "movd %3, %%mm6 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
645 "punpcklbw %%mm6, %%mm6 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
646 "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
647 H264_DEBLOCK_P0_Q0(%6, %7)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
648 "movq %%mm1, (%0,%2) \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
649 "movq %%mm2, (%1) \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
650
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
651 :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
652 "r"(*(uint32_t*)tc0),
5947
37a03989871b use ff_ prefix for extern vars
aurel
parents: 5946
diff changeset
653 "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
654 );
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
655 }
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
656
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
/**
 * Filter an 8 pixel wide chroma edge lying between two rows.
 *
 * Thin adapter: the shared filter expects the thresholds already reduced
 * by one.
 */
static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int alpha1 = alpha-1;
    const int beta1  = beta-1;

    h264_loop_filter_chroma_mmx2(pix, stride, alpha1, beta1, tc0);
}
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
661
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
662 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
663 {
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
664 //FIXME: could cut some load/stores by merging transpose with filter
4137
6e5dcbdbfeba ensure alignment (no speed change)
michael
parents: 4136
diff changeset
665 DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
666 transpose4x4(trans, pix-2, 8, stride);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
667 transpose4x4(trans+4, pix-2+4*stride, 8, stride);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
668 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
669 transpose4x4(pix-2, trans, stride, 8);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
670 transpose4x4(pix-2+4*stride, trans+4, stride, 8);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
671 }
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
672
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
/*
 * H264_FILTER_CHROMA4: p0 = (p0 + q1 + 2*p1 + 2) >> 2
 *
 * p0, p1, q1 = registers holding the respective pixels; p0 is overwritten
 *              with the result
 * one        = operand holding a vector of 0x01 bytes
 * clobbers: mm4
 *
 * pavgb rounds up, so the first average is corrected by (p0^q1)&1 before the
 * second average is taken.
 *
 * The last line deliberately carries no line continuation, so the macro ends
 * here regardless of what follows it in the file.
 */
#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
    "movq "#p0", %%mm4 \n\t"\
    "pxor "#q1", %%mm4 \n\t"\
    "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\
    "pavgb "#q1", "#p0" \n\t"\
    "psubusb %%mm4, "#p0" \n\t"\
    "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
681
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
682 static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
683 {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
684 __asm__ volatile(
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
685 "movq (%0), %%mm0 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
686 "movq (%0,%2), %%mm1 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
687 "movq (%1), %%mm2 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
688 "movq (%1,%2), %%mm3 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
689 H264_DEBLOCK_MASK(%3, %4)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
690 "movq %%mm1, %%mm5 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
691 "movq %%mm2, %%mm6 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
692 H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
693 H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
694 "psubb %%mm5, %%mm1 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
695 "psubb %%mm6, %%mm2 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
696 "pand %%mm7, %%mm1 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
697 "pand %%mm7, %%mm2 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
698 "paddb %%mm5, %%mm1 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
699 "paddb %%mm6, %%mm2 \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
700 "movq %%mm1, (%0,%2) \n\t"
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
701 "movq %%mm2, (%1) \n\t"
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
702 :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
5947
37a03989871b use ff_ prefix for extern vars
aurel
parents: 5946
diff changeset
703 "m"(alpha1), "m"(beta1), "m"(ff_bone)
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
704 );
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
705 }
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
706
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
/**
 * Intra-strength filter of an 8 pixel wide chroma edge lying between two rows.
 *
 * Thin adapter: the shared filter expects the thresholds already reduced
 * by one.
 */
static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
{
    const int alpha1 = alpha-1;
    const int beta1  = beta-1;

    h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha1, beta1);
}
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
711
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
712 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
713 {
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
714 //FIXME: could cut some load/stores by merging transpose with filter
4137
6e5dcbdbfeba ensure alignment (no speed change)
michael
parents: 4136
diff changeset
715 DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
716 transpose4x4(trans, pix-2, 8, stride);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
717 transpose4x4(trans+4, pix-2+4*stride, 8, stride);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
718 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
719 transpose4x4(pix-2, trans, stride, 8);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
720 transpose4x4(pix-2+4*stride, trans+4, stride, 8);
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
721 }
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
722
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
723 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
7327
483421b11d98 Fix h264_loop_filter_strength_mmx2() so it works with PAFF.
michael
parents: 6755
diff changeset
724 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
725 int dir;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
726 __asm__ volatile(
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
727 "pxor %%mm7, %%mm7 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
728 "movq %0, %%mm6 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
729 "movq %1, %%mm5 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
730 "movq %2, %%mm4 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
731 ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7)
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
732 );
7327
483421b11d98 Fix h264_loop_filter_strength_mmx2() so it works with PAFF.
michael
parents: 6755
diff changeset
733 if(field)
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
734 __asm__ volatile(
7327
483421b11d98 Fix h264_loop_filter_strength_mmx2() so it works with PAFF.
michael
parents: 6755
diff changeset
735 "movq %0, %%mm5 \n\t"
483421b11d98 Fix h264_loop_filter_strength_mmx2() so it works with PAFF.
michael
parents: 6755
diff changeset
736 "movq %1, %%mm4 \n\t"
483421b11d98 Fix h264_loop_filter_strength_mmx2() so it works with PAFF.
michael
parents: 6755
diff changeset
737 ::"m"(ff_pb_3_1), "m"(ff_pb_7_3)
483421b11d98 Fix h264_loop_filter_strength_mmx2() so it works with PAFF.
michael
parents: 6755
diff changeset
738 );
483421b11d98 Fix h264_loop_filter_strength_mmx2() so it works with PAFF.
michael
parents: 6755
diff changeset
739
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
740 // could do a special case for dir==0 && edges==1, but it only reduces the
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
741 // average filter time by 1.2%
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
742 for( dir=1; dir>=0; dir-- ) {
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
743 const int d_idx = dir ? -8 : -1;
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
744 const int mask_mv = dir ? mask_mv1 : mask_mv0;
4137
6e5dcbdbfeba ensure alignment (no speed change)
michael
parents: 4136
diff changeset
745 DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
746 int b_idx, edge, l;
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
747 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
748 __asm__ volatile(
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
749 "pand %0, %%mm0 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
750 ::"m"(mask_dir)
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
751 );
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
752 if(!(mask_mv & edge)) {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
753 __asm__ volatile("pxor %%mm0, %%mm0 \n\t":);
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
754 for( l = bidir; l >= 0; l-- ) {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
755 __asm__ volatile(
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
756 "movd %0, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
757 "punpckldq %1, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
758 "movq %%mm1, %%mm2 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
759 "psrlw $7, %%mm2 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
760 "pand %%mm6, %%mm2 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
761 "por %%mm2, %%mm1 \n\t" // ref_cache with -2 mapped to -1
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
762 "punpckldq %%mm1, %%mm2 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
763 "pcmpeqb %%mm2, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
764 "paddb %%mm6, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
765 "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn]
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
766 "por %%mm1, %%mm0 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
767
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
768 "movq %2, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
769 "movq %3, %%mm2 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
770 "psubw %4, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
771 "psubw %5, %%mm2 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
772 "packsswb %%mm2, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
773 "paddb %%mm5, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
774 "pminub %%mm4, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
775 "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
776 "por %%mm1, %%mm0 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
777 ::"m"(ref[l][b_idx]),
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
778 "m"(ref[l][b_idx+d_idx]),
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
779 "m"(mv[l][b_idx][0]),
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
780 "m"(mv[l][b_idx+2][0]),
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
781 "m"(mv[l][b_idx+d_idx][0]),
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
782 "m"(mv[l][b_idx+d_idx+2][0])
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
783 );
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
784 }
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
785 }
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
786 __asm__ volatile(
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
787 "movd %0, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
788 "por %1, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
789 "punpcklbw %%mm7, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
790 "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn]
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
791 ::"m"(nnz[b_idx]),
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
792 "m"(nnz[b_idx+d_idx])
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
793 );
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
794 __asm__ volatile(
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
795 "pcmpeqw %%mm7, %%mm0 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
796 "pcmpeqw %%mm7, %%mm0 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
797 "psrlw $15, %%mm0 \n\t" // nonzero -> 1
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
798 "psrlw $14, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
799 "movq %%mm0, %%mm2 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
800 "por %%mm1, %%mm2 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
801 "psrlw $1, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
802 "pandn %%mm2, %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
803 "movq %%mm1, %0 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
804 :"=m"(*bS[dir][edge])
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
805 ::"memory"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
806 );
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
807 }
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
808 edges = 4;
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
809 step = 1;
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
810 }
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
811 __asm__ volatile(
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
812 "movq (%0), %%mm0 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
813 "movq 8(%0), %%mm1 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
814 "movq 16(%0), %%mm2 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
815 "movq 24(%0), %%mm3 \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
816 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
817 "movq %%mm0, (%0) \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
818 "movq %%mm3, 8(%0) \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
819 "movq %%mm4, 16(%0) \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
820 "movq %%mm2, 24(%0) \n\t"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
821 ::"r"(bS[0])
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
822 :"memory"
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
823 );
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3394
diff changeset
824 }
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
825
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
826 /***********************************/
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
827 /* motion compensation */
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
828
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
/*
 * QPEL_H264V_MM: asm text for one output row of a vertical 6-tap filter.
 *
 * A..E name registers holding five consecutive input rows as 16-bit words
 * (the visible callers load each row and punpcklbw it against a zeroed
 * register first).  F receives the next row: it is loaded from (%0) with
 * "mov"#d and unpacked against Z.  T is the accumulator/temporary.
 *
 * Per 16-bit lane the sequence computes
 *     T = (((C + D) << 2) - B - E) * %4  +  A + F + %5
 *     T >>= 5   (arithmetic shift)
 * then packs T to unsigned-saturated bytes and passes it to
 * OP(T, (%1), A, d).
 *
 * Side effects: A is overwritten (A += %5 + F, and it is also handed to OP),
 * %0 is advanced by %2 and %1 is advanced by %3.
 *
 * In the visible user of this macro %2/%3 are the source/destination strides
 * and %4/%5 are ff_pw_5/ff_pw_16 -- presumably packed words of 5 and 16,
 * which would make this the (1,-5,20,20,-5,1) filter with +16 rounding;
 * TODO confirm against the definitions of those constants.
 * NOTE(review): OP is supplied by the instantiating macro and is not defined
 * in this part of the file; its third argument looks like a scratch register.
 */
829 #define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
830 "mov"#q" "#C", "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
831 "mov"#d" (%0), "#F" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
832 "paddw "#D", "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
833 "psllw $2, "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
834 "psubw "#B", "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
835 "psubw "#E", "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
836 "punpcklbw "#Z", "#F" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
837 "pmullw %4, "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
838 "paddw %5, "#A" \n\t"\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
839 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
840 "paddw "#F", "#A" \n\t"\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
841 "paddw "#A", "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
842 "psraw $5, "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
843 "packuswb "#T", "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
844 OP(T, (%1), A, d)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
845 "add %3, %1 \n\t"
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
846
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
/*
 * QPEL_H264HV_MM: asm text for one row of the vertical pass that feeds a
 * later horizontal pass.
 *
 * Same register roles as the vertical-only variant: A..E hold five
 * consecutive input rows as 16-bit words, F receives the next row (loaded
 * from (%0) with "mov"#d and unpacked against Z), T is the accumulator.
 *
 * Per 16-bit lane the sequence computes
 *     T = (((C + D) << 2) - B - E) * %3  +  A + F + %4
 * and stores T unshifted and unpacked (still 16-bit) with "mov"#q at byte
 * offset OF from %1.
 *
 * Side effects: A is overwritten (A += %4 + F) and %0 is advanced by %2.
 * %1 is not modified here; the caller selects the row through OF.
 *
 * In the visible user of this macro %1 is the int16_t tmp buffer, %2 is the
 * source stride and %3/%4 are ff_pw_5/ff_pw_16 -- presumably packed words of
 * 5 and 16; TODO confirm against the definitions of those constants.
 */
847 #define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
848 "mov"#q" "#C", "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
849 "mov"#d" (%0), "#F" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
850 "paddw "#D", "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
851 "psllw $2, "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
852 "paddw %4, "#A" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
853 "psubw "#B", "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
854 "psubw "#E", "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
855 "punpcklbw "#Z", "#F" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
856 "pmullw %3, "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
857 "paddw "#F", "#A" \n\t"\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
858 "add %2, %0 \n\t"\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
859 "paddw "#A", "#T" \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
860 "mov"#q" "#T", "#OF"(%1) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
861
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
/*
 * Register-width bindings for the two row-filter macros.
 * The plain variants use %%mm6 as the temporary and %%mm7 as the unpack
 * operand, with d/q suffixes, so "mov"#d expands to movd for the row load
 * and "mov"#q to movq for register moves and stores.
 * The _XMM variants use %%xmm6/%%xmm7 with q/dqa suffixes, so the row load
 * becomes movq and register moves and stores become movdqa.
 * NOTE(review): movdqa requires 16-byte-aligned memory operands, so the HV
 * store target used with the _XMM variant is presumably aligned -- confirm
 * at the call sites.
 */
862 #define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
863 #define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
864 #define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
865 #define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
866
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2922
diff changeset
867
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
868 #define QPEL_H264(OPNAME, OP, MMX)\
4527
481763d70193 prevent h.264 MC related functions from being inlined (yes this is much faster the code just doesnt fit in the code cache otherwise)
michael
parents: 4137
diff changeset
869 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
870 int h=4;\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
871 \
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
872 __asm__ volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
873 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
874 "movq %5, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
875 "movq %6, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
876 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
877 "movd -1(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
878 "movd (%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
879 "movd 1(%0), %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
880 "movd 2(%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
881 "punpcklbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
882 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
883 "punpcklbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
884 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
885 "paddw %%mm0, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
886 "paddw %%mm3, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
887 "movd -2(%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
888 "movd 3(%0), %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
889 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
890 "punpcklbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
891 "paddw %%mm3, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
892 "psllw $2, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
893 "psubw %%mm1, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
894 "pmullw %%mm4, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
895 "paddw %%mm5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
896 "paddw %%mm2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
897 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
898 "packuswb %%mm0, %%mm0 \n\t"\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
899 OP(%%mm0, (%1),%%mm6, d)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
900 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
901 "add %4, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
902 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
903 " jnz 1b \n\t"\
6335
950811a14eb3 put loop counter in a register if possible. makes some of the qpel functions 3% faster.
lorenm
parents: 6334
diff changeset
904 : "+a"(src), "+c"(dst), "+g"(h)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
905 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
906 : "memory"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
907 );\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
908 }\
4527
481763d70193 prevent h.264 MC related functions from being inlined (yes this is much faster the code just doesnt fit in the code cache otherwise)
michael
parents: 4137
diff changeset
909 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
3156
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
910 int h=4;\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
911 __asm__ volatile(\
3156
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
912 "pxor %%mm7, %%mm7 \n\t"\
3165
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
913 "movq %0, %%mm4 \n\t"\
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
914 "movq %1, %%mm5 \n\t"\
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
915 :: "m"(ff_pw_5), "m"(ff_pw_16)\
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
916 );\
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
917 do{\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
918 __asm__ volatile(\
3156
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
919 "movd -1(%0), %%mm1 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
920 "movd (%0), %%mm2 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
921 "movd 1(%0), %%mm3 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
922 "movd 2(%0), %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
923 "punpcklbw %%mm7, %%mm1 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
924 "punpcklbw %%mm7, %%mm2 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
925 "punpcklbw %%mm7, %%mm3 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
926 "punpcklbw %%mm7, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
927 "paddw %%mm0, %%mm1 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
928 "paddw %%mm3, %%mm2 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
929 "movd -2(%0), %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
930 "movd 3(%0), %%mm3 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
931 "punpcklbw %%mm7, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
932 "punpcklbw %%mm7, %%mm3 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
933 "paddw %%mm3, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
934 "psllw $2, %%mm2 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
935 "psubw %%mm1, %%mm2 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
936 "pmullw %%mm4, %%mm2 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
937 "paddw %%mm5, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
938 "paddw %%mm2, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
939 "movd (%2), %%mm3 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
940 "psraw $5, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
941 "packuswb %%mm0, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
942 PAVGB" %%mm3, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
943 OP(%%mm0, (%1),%%mm6, d)\
3165
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
944 "add %4, %0 \n\t"\
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
945 "add %4, %1 \n\t"\
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
946 "add %3, %2 \n\t"\
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
947 : "+a"(src), "+c"(dst), "+d"(src2)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
948 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
3156
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
949 : "memory"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
950 );\
3165
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
951 }while(--h);\
3156
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
952 }\
4527
481763d70193 prevent h.264 MC related functions from being inlined (yes this is much faster the code just doesnt fit in the code cache otherwise)
michael
parents: 4137
diff changeset
953 static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
954 src -= 2*srcStride;\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
955 __asm__ volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
956 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
957 "movd (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
958 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
959 "movd (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
960 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
961 "movd (%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
962 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
963 "movd (%0), %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
964 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
965 "movd (%0), %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
966 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
967 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
968 "punpcklbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
969 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
970 "punpcklbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
971 "punpcklbw %%mm7, %%mm4 \n\t"\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
972 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
973 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
974 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
975 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
976 \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
977 : "+a"(src), "+c"(dst)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
978 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
979 : "memory"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
980 );\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
981 }\
4527
481763d70193 prevent h.264 MC related functions from being inlined (yes this is much faster the code just doesnt fit in the code cache otherwise)
michael
parents: 4137
diff changeset
982 static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
983 int h=4;\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
984 int w=3;\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
985 src -= 2*srcStride+2;\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
986 while(w--){\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
987 __asm__ volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
988 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
989 "movd (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
990 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
991 "movd (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
992 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
993 "movd (%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
994 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
995 "movd (%0), %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
996 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
997 "movd (%0), %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
998 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
999 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1000 "punpcklbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1001 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1002 "punpcklbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1003 "punpcklbw %%mm7, %%mm4 \n\t"\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1004 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1005 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1006 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1007 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1008 \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1009 : "+a"(src)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1010 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1011 : "memory"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1012 );\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1013 tmp += 4;\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1014 src += 4 - 9*srcStride;\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1015 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1016 tmp -= 3*4;\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1017 __asm__ volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1018 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1019 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1020 "paddw 10(%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1021 "movq 2(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1022 "paddw 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1023 "movq 4(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1024 "paddw 6(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1025 "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1026 "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1027 "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1028 "paddsw %%mm2, %%mm0 \n\t"\
3001
b52d8ee430f6 fix some potential arithmetic overflows in pred_direct_motion() and
lorenm
parents: 2979
diff changeset
1029 "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
6330
5070b3157fcf add qpel rounder once during hv rather than twice during hv and whatever it's averaged with
lorenm
parents: 6329
diff changeset
1030 "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1031 "psraw $6, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1032 "packuswb %%mm0, %%mm0 \n\t"\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1033 OP(%%mm0, (%1),%%mm7, d)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1034 "add $24, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1035 "add %3, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1036 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1037 " jnz 1b \n\t"\
6335
950811a14eb3 put loop counter in a register if possible. makes some of the qpel functions 3% faster.
lorenm
parents: 6334
diff changeset
1038 : "+a"(tmp), "+c"(dst), "+g"(h)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1039 : "S"((x86_reg)dstStride)\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1040 : "memory"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1041 );\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1042 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1043 \
4527
481763d70193 prevent h.264 MC related functions from being inlined (yes this is much faster the code just doesnt fit in the code cache otherwise)
michael
parents: 4137
diff changeset
1044 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Horizontal 6-tap lowpass over 8 rows of 8 pixels. Per pixel i: (((src[i]+src[i+1])*4 - (src[i-1]+src[i+2])) * ff_pw_5 + (src[i-2]+src[i+3]) + ff_pw_16) >> 5, saturated to a byte and handed to OP for the write to dst. Each row reads src[-2..10]. ff_pw_5 and ff_pw_16 are defined elsewhere; presumably packed words of 5 and 16 - confirm. */\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1045 int h=8;/* row counter, decremented inside the asm loop */\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1046 __asm__ volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1047 "pxor %%mm7, %%mm7 \n\t"/* mm7 = 0, used to zero-extend bytes to words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1048 "movq %5, %%mm6 \n\t"/* mm6 = ff_pw_5 (operand 5), the multiplier */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1049 "1: \n\t"/* top of the per-row loop */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1050 "movq (%0), %%mm0 \n\t"/* src[0..7] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1051 "movq 1(%0), %%mm2 \n\t"/* src[1..8] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1052 "movq %%mm0, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1053 "movq %%mm2, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1054 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1055 "punpckhbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1056 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1057 "punpckhbw %%mm7, %%mm3 \n\t"/* mm0/mm1 = low/high words of src[0..7], mm2/mm3 = low/high words of src[1..8] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1058 "paddw %%mm2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1059 "paddw %%mm3, %%mm1 \n\t"/* mm0:mm1 = src[i]+src[i+1] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1060 "psllw $2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1061 "psllw $2, %%mm1 \n\t"/* centre-pair sum times 4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1062 "movq -1(%0), %%mm2 \n\t"/* src[-1..6] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1063 "movq 2(%0), %%mm4 \n\t"/* src[2..9] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1064 "movq %%mm2, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1065 "movq %%mm4, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1066 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1067 "punpckhbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1068 "punpcklbw %%mm7, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1069 "punpckhbw %%mm7, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1070 "paddw %%mm4, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1071 "paddw %%mm3, %%mm5 \n\t"/* mm2 (low) and mm5 (high) = src[i-1]+src[i+2]; mm3 and mm4 still hold the unsummed words of src[3..6] and src[2..5] for reuse below */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1072 "psubw %%mm2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1073 "psubw %%mm5, %%mm1 \n\t"/* 4*(src[i]+src[i+1]) - (src[i-1]+src[i+2]) */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1074 "pmullw %%mm6, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1075 "pmullw %%mm6, %%mm1 \n\t"/* multiplied by mm6 (ff_pw_5) */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1076 "movd -2(%0), %%mm2 \n\t"/* src[-2..1] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1077 "movd 7(%0), %%mm5 \n\t"/* src[7..10] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1078 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1079 "punpcklbw %%mm7, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1080 "paddw %%mm3, %%mm2 \n\t"/* low half: src[i-2]+src[i+3] for i=0..3 (mm3 holds src[3..6]) */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1081 "paddw %%mm5, %%mm4 \n\t"/* high half: src[i-2]+src[i+3] for i=4..7 (mm4 holds src[2..5]) */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1082 "movq %6, %%mm5 \n\t"/* mm5 = ff_pw_16 (operand 6) */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1083 "paddw %%mm5, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1084 "paddw %%mm5, %%mm4 \n\t"/* add ff_pw_16 to the outer-tap sums */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1085 "paddw %%mm2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1086 "paddw %%mm4, %%mm1 \n\t"/* complete filter sum in mm0 (low) and mm1 (high) */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1087 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1088 "psraw $5, %%mm1 \n\t"/* arithmetic shift right by 5 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1089 "packuswb %%mm1, %%mm0 \n\t"/* pack the 8 words into 8 unsigned-saturated bytes in mm0 */\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1090 OP(%%mm0, (%1),%%mm5, q)/* OP is a macro argument; presumably stores or averages mm0 into (dst) with mm5 as scratch - confirm at the instantiation */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1091 "add %3, %0 \n\t"/* src += srcStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1092 "add %4, %1 \n\t"/* dst += dstStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1093 "decl %2 \n\t"/* --h */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1094 " jnz 1b \n\t"/* next row until h reaches 0 */\
6335
950811a14eb3 put loop counter in a register if possible. makes some of the qpel functions 3% faster.
lorenm
parents: 6334
diff changeset
1095 : "+a"(src), "+c"(dst), "+g"(h)/* operands 0..2: src, dst, h (all read-write) */\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1096 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)/* operands 3..6: srcStride, dstStride, ff_pw_5, ff_pw_16 */\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1097 : "memory"/* only memory is listed as clobbered; the MMX registers used above are not declared */\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1098 );\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1099 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1100 \
4527
481763d70193 prevent h.264 MC related functions from being inlined (yes this is much faster the code just doesnt fit in the code cache otherwise)
michael
parents: 4137
diff changeset
1101 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){/* Horizontal 6-tap lowpass over 8 rows of 8 pixels, then combined with the matching row of src2 through PAVGB before OP writes to dst. Per pixel i the filter is (((src[i]+src[i+1])*4 - (src[i-1]+src[i+2])) * ff_pw_5 + (src[i-2]+src[i+3]) + ff_pw_16) >> 5, saturated to a byte. ff_pw_5, ff_pw_16 and PAVGB are defined elsewhere. */\
3156
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1102 int h=8;/* row counter for the C do-while loop below */\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1103 __asm__ volatile(\
3156
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1104 "pxor %%mm7, %%mm7 \n\t"/* mm7 = 0, used to zero-extend bytes to words */\
3165
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
1105 "movq %0, %%mm6 \n\t"/* mm6 = ff_pw_5; mm6 and mm7 are set in this separate asm statement and relied on, undeclared, by the loop body below */\
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
1106 :: "m"(ff_pw_5)/* no outputs; operand 0 = ff_pw_5 */\
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
1107 );\
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
1108 do{/* one row per iteration */\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1109 __asm__ volatile(\
3156
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1110 "movq (%0), %%mm0 \n\t"/* src[0..7] */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1111 "movq 1(%0), %%mm2 \n\t"/* src[1..8] */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1112 "movq %%mm0, %%mm1 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1113 "movq %%mm2, %%mm3 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1114 "punpcklbw %%mm7, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1115 "punpckhbw %%mm7, %%mm1 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1116 "punpcklbw %%mm7, %%mm2 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1117 "punpckhbw %%mm7, %%mm3 \n\t"/* mm0/mm1 = low/high words of src[0..7], mm2/mm3 = low/high words of src[1..8] */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1118 "paddw %%mm2, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1119 "paddw %%mm3, %%mm1 \n\t"/* mm0:mm1 = src[i]+src[i+1] */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1120 "psllw $2, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1121 "psllw $2, %%mm1 \n\t"/* centre-pair sum times 4 */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1122 "movq -1(%0), %%mm2 \n\t"/* src[-1..6] */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1123 "movq 2(%0), %%mm4 \n\t"/* src[2..9] */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1124 "movq %%mm2, %%mm3 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1125 "movq %%mm4, %%mm5 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1126 "punpcklbw %%mm7, %%mm2 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1127 "punpckhbw %%mm7, %%mm3 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1128 "punpcklbw %%mm7, %%mm4 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1129 "punpckhbw %%mm7, %%mm5 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1130 "paddw %%mm4, %%mm2 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1131 "paddw %%mm3, %%mm5 \n\t"/* mm2 (low) and mm5 (high) = src[i-1]+src[i+2]; mm3 and mm4 still hold the unsummed words of src[3..6] and src[2..5] for reuse below */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1132 "psubw %%mm2, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1133 "psubw %%mm5, %%mm1 \n\t"/* 4*(src[i]+src[i+1]) - (src[i-1]+src[i+2]) */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1134 "pmullw %%mm6, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1135 "pmullw %%mm6, %%mm1 \n\t"/* multiplied by mm6 (ff_pw_5) */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1136 "movd -2(%0), %%mm2 \n\t"/* src[-2..1] */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1137 "movd 7(%0), %%mm5 \n\t"/* src[7..10] */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1138 "punpcklbw %%mm7, %%mm2 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1139 "punpcklbw %%mm7, %%mm5 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1140 "paddw %%mm3, %%mm2 \n\t"/* low half: src[i-2]+src[i+3] for i=0..3 (mm3 holds src[3..6]) */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1141 "paddw %%mm5, %%mm4 \n\t"/* high half: src[i-2]+src[i+3] for i=4..7 (mm4 holds src[2..5]) */\
3165
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
1142 "movq %5, %%mm5 \n\t"/* mm5 = ff_pw_16 (operand 5 of this asm statement) */\
3156
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1143 "paddw %%mm5, %%mm2 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1144 "paddw %%mm5, %%mm4 \n\t"/* add ff_pw_16 to the outer-tap sums */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1145 "paddw %%mm2, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1146 "paddw %%mm4, %%mm1 \n\t"/* complete filter sum in mm0 (low) and mm1 (high) */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1147 "psraw $5, %%mm0 \n\t"\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1148 "psraw $5, %%mm1 \n\t"/* arithmetic shift right by 5 */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1149 "movq (%2), %%mm4 \n\t"/* mm4 = 8 bytes of the current src2 row */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1150 "packuswb %%mm1, %%mm0 \n\t"/* pack the 8 words into 8 unsigned-saturated bytes in mm0 */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1151 PAVGB" %%mm4, %%mm0 \n\t"/* PAVGB is a macro defined elsewhere, presumably the packed byte-average mnemonic for this CPU flavour - confirm; combines the filtered row with the src2 row */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1152 OP(%%mm0, (%1),%%mm5, q)/* OP is a macro argument; presumably stores or averages mm0 into (dst) with mm5 as scratch - confirm at the instantiation */\
3165
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
1153 "add %4, %0 \n\t"/* src += dstStride. NOTE(review): src is stepped by dstStride because there is no srcStride parameter; presumably callers pass src with the same stride as dst - confirm */\
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
1154 "add %4, %1 \n\t"/* dst += dstStride */\
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
1155 "add %3, %2 \n\t"/* src2 += src2Stride */\
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
1156 : "+a"(src), "+c"(dst), "+d"(src2)/* operands 0..2: src, dst, src2 (all read-write) */\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1157 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),/* operands 3..4: src2Stride, dstStride */\
3165
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
1158 "m"(ff_pw_16)/* operand 5: ff_pw_16 */\
3156
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1159 : "memory"/* only memory is listed as clobbered; the MMX registers used above are not declared */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1160 );\
3165
8b51e108cba6 gcc2.95 workaround
lorenm
parents: 3163
diff changeset
1161 }while(--h);/* 8 rows in total */\
3156
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1162 }\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1163 \
4527
481763d70193 prevent h.264 MC related functions from being inlined (yes this is much faster the code just doesnt fit in the code cache otherwise)
michael
parents: 4137
diff changeset
1164 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){/* Vertical lowpass on an 8-pixel-wide block, done as two 4-pixel-wide columns. Each column loads 5 source rows and then runs 8 QPEL_H264V steps, plus 8 more when h==16. QPEL_H264V is defined elsewhere; presumably each step loads one more row, filters and writes one output row through OP - confirm against the macro. */\
3094
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1165 int w= 2;/* number of 4-pixel-wide columns */\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1166 src -= 2*srcStride;/* start two rows above the block */\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1167 \
3094
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1168 while(w--){\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1169 __asm__ volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1170 "pxor %%mm7, %%mm7 \n\t"/* mm7 = 0, used to zero-extend bytes to words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1171 "movd (%0), %%mm0 \n\t"/* 4 bytes of row 0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1172 "add %2, %0 \n\t"/* src += srcStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1173 "movd (%0), %%mm1 \n\t"/* row 1 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1174 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1175 "movd (%0), %%mm2 \n\t"/* row 2 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1176 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1177 "movd (%0), %%mm3 \n\t"/* row 3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1178 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1179 "movd (%0), %%mm4 \n\t"/* row 4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1180 "add %2, %0 \n\t"/* src now points at row 5 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1181 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1182 "punpcklbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1183 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1184 "punpcklbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1185 "punpcklbw %%mm7, %%mm4 \n\t"/* mm0..mm4 = the five rows widened to 4 words each */\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1186 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)/* 8 steps; the register arguments rotate by one position per step, so five registers carry over and the sixth is presumably reloaded by the macro */\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1187 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1188 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1189 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1190 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1191 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1192 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1193 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1194 \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1195 : "+a"(src), "+c"(dst)/* operands 0..1: src, dst (read-write) */\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1196 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)/* operands 2..5: srcStride, dstStride, ff_pw_5, ff_pw_16 */\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1197 : "memory"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1198 );\
3094
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1199 if(h==16){/* 8 further steps for the 16-row case; relies on the MMX registers keeping their values from the asm statement above, continuing the rotation where it stopped */\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1200 __asm__ volatile(\
3094
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1201 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1202 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1203 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1204 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1205 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1206 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1207 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1208 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1209 \
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1210 : "+a"(src), "+c"(dst)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1211 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
3094
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1212 : "memory"\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1213 );\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1214 }\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1215 src += 4-(h+5)*srcStride;/* move 4 pixels right and rewind h+5 rows; this implies each QPEL_H264V step advances src by one row (macro not visible here) */\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1216 dst += 4-h*dstStride;/* move 4 pixels right and rewind h rows */\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1217 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1218 }\
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1219 static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){/* First pass of the hv lowpass: walks src in 4-pixel-wide columns and runs QPEL_H264HV steps that take tmp (operand 1) and a per-step offset argument. QPEL_H264HV is defined elsewhere; presumably each step vertically filters one row into tmp at that offset - confirm against the macro. tmpStride is not used in this body. */\
3093
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1220 int w = (size+8)>>2;/* number of 4-pixel-wide columns: 4 for size 8, 6 for size 16 */\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1221 src -= 2*srcStride+2;/* start two rows above and two pixels left of the block */\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1222 while(w--){\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1223 __asm__ volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1224 "pxor %%mm7, %%mm7 \n\t"/* mm7 = 0, used to zero-extend bytes to words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1225 "movd (%0), %%mm0 \n\t"/* 4 bytes of row 0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1226 "add %2, %0 \n\t"/* src += srcStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1227 "movd (%0), %%mm1 \n\t"/* row 1 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1228 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1229 "movd (%0), %%mm2 \n\t"/* row 2 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1230 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1231 "movd (%0), %%mm3 \n\t"/* row 3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1232 "add %2, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1233 "movd (%0), %%mm4 \n\t"/* row 4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1234 "add %2, %0 \n\t"/* src now points at row 5 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1235 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1236 "punpcklbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1237 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1238 "punpcklbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1239 "punpcklbw %%mm7, %%mm4 \n\t"/* mm0..mm4 = the five rows widened to 4 words each */\
3093
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1240 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)/* 8 steps with rotating register arguments; the last argument grows by 48 per step, presumably the byte offset of the output row in tmp (48 bytes = 24 int16_t) - confirm */\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1241 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1242 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1243 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1244 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1245 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1246 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1247 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1248 : "+a"(src)/* operand 0: src (read-write) */\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1249 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)/* operands 1..4: tmp, srcStride, ff_pw_5, ff_pw_16 */\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1250 : "memory"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1251 );\
3093
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1252 if(size==16){/* 8 further steps for size 16; relies on the MMX registers keeping their values from the asm statement above, continuing the rotation and the offsets (8*48 .. 15*48) */\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1253 __asm__ volatile(\
3093
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1254 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1255 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1256 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1257 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1258 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1259 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1260 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1261 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1262 : "+a"(src)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1263 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
3093
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1264 : "memory"\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1265 );\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1266 }\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1267 tmp += 4;/* next column in tmp: 4 int16_t elements to the right */\
3093
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1268 src += 4 - (size+5)*srcStride;/* move 4 pixels right and rewind size+5 rows; this implies each QPEL_H264HV step advances src by one row (macro not visible here) */\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1269 }\
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1270 }\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1271 static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){ /* second hv pass: filters size rows of the int16 intermediate in tmp horizontally and writes 8 output bytes per row to dst through OP; repeated for a second 8-column strip when size>>4 is 1; tmpStride is not read here */\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1272 int w = size>>4;\
3093
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1273 do{\
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1274 int h = size; /* row counter decremented inside the asm loop */\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1275 __asm__ volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1276 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1277 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1278 "movq 8(%0), %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1279 "movq 2(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1280 "movq 10(%0), %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1281 "paddw %%mm4, %%mm0 \n\t" /* mm0 = t[0..3] + t[5..8] (t = int16 words of the current tmp row) */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1282 "paddw %%mm3, %%mm1 \n\t" /* mm1 = t[1..4] + t[4..7] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1283 "paddw 18(%0), %%mm3 \n\t" /* mm3 = t[4..7] + t[9..12] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1284 "paddw 16(%0), %%mm4 \n\t" /* mm4 = t[5..8] + t[8..11] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1285 "movq 4(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1286 "movq 12(%0), %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1287 "paddw 6(%0), %%mm2 \n\t" /* mm2 = t[2..5] + t[3..6] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1288 "paddw 14(%0), %%mm5 \n\t" /* mm5 = t[6..9] + t[7..10] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1289 "psubw %%mm1, %%mm0 \n\t" /* from here: mm0 = (((((mm0-mm1)>>2) - mm1 + mm2) >> 2) + mm2) >> 6, and likewise mm3 from mm4 and mm5 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1290 "psubw %%mm4, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1291 "psraw $2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1292 "psraw $2, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1293 "psubw %%mm1, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1294 "psubw %%mm4, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1295 "paddsw %%mm2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1296 "paddsw %%mm5, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1297 "psraw $2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1298 "psraw $2, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1299 "paddw %%mm2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1300 "paddw %%mm5, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1301 "psraw $6, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1302 "psraw $6, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1303 "packuswb %%mm3, %%mm0 \n\t" /* saturate both word quads into 8 unsigned bytes */\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1304 OP(%%mm0, (%1),%%mm7, q)\
3093
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1305 "add $48, %0 \n\t" /* next tmp row: rows are 48 bytes apart */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1306 "add %3, %1 \n\t" /* dst += dstStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1307 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1308 " jnz 1b \n\t"\
6335
950811a14eb3 put loop counter in a register if possible. makes some of the qpel functions 3% faster.
lorenm
parents: 6334
diff changeset
1309 : "+a"(tmp), "+c"(dst), "+g"(h)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1310 : "S"((x86_reg)dstStride)\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1311 : "memory"\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1312 );\
3093
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1313 tmp += 8 - size*24; /* rewind the size rows just consumed (24 int16 each) and step 8 columns right */\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1314 dst += 8 - size*dstStride; /* same rewind and 8-column step for the destination */\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1315 }while(w--); /* one pass when size>>4 is 0, two passes when it is 1 */\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1316 }\
3094
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1317 \
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1318 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 8-wide vertical lowpass: forwards to the shared 8or16 routine with size 8 */\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1319 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1320 }\
4527
481763d70193 prevent h.264 MC related functions from being inlined (yes this is much faster the code just doesnt fit in the code cache otherwise)
michael
parents: 4137
diff changeset
1321 static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 16-wide vertical lowpass: runs the shared 8or16 routine with size 16 once at (dst, src) and once 8 columns to the right */\
3094
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1322 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
385fa360f618 11% faster put_h264_qpel16_v_lowpass_mmx2
lorenm
parents: 3093
diff changeset
1323 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1324 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1325 \
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1326 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 16-wide horizontal lowpass assembled from four calls to the 8-wide routine */\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1327 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1328 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1329 src += 8*srcStride; /* step both pointers 8 rows down before the second pair of calls */\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1330 dst += 8*dstStride;\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1331 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1332 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1333 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1334 \
4527
481763d70193 prevent h.264 MC related functions from being inlined (yes this is much faster the code just doesnt fit in the code cache otherwise)
michael
parents: 4137
diff changeset
1335 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){ /* 16-wide l2 variant assembled from four calls to the 8-wide l2 routine: two side by side, then two more after stepping 8 rows */\
3156
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1336 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1337 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1338 src += 8*dstStride; /* src is stepped with dstStride: this function has no separate source stride parameter */\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1339 dst += 8*dstStride;\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1340 src2 += 8*src2Stride;\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1341 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1342 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1343 }\
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1344 \
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1345 static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){ /* two-pass hv lowpass: pass 1 (always the put_ variant, regardless of OPNAME) fills tmp from src, pass 2 writes dst from tmp with the OPNAME variant */\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1346 put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1347 OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1348 }\
3093
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1349 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* hv lowpass wrapper: forwards all arguments to the 8or16 routine with size 8 */\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1350 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1351 }\
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1352 \
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1353 static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* hv lowpass wrapper: forwards all arguments to the 8or16 routine with size 16 */\
3093
19f3dd63af7d 15% faster put_h264_qpel16_hv_lowpass_mmx2
lorenm
parents: 3036
diff changeset
1354 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1355 }\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1356 \
4527
481763d70193 prevent h.264 MC related functions from being inlined (yes this is much faster the code just doesnt fit in the code cache otherwise)
michael
parents: 4137
diff changeset
1357 static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h) /* 4-wide, fixed 4 rows: each int16 row of src16 (rows 24 bytes apart) is shifted right by 5, packed to unsigned bytes, combined with the matching src8 row via PAVGB and stored to dst through OP; the h argument is not read */\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1358 {\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1359 __asm__ volatile(\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1360 "movq (%1), %%mm0 \n\t" /* src16 row 0 */\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1361 "movq 24(%1), %%mm1 \n\t" /* src16 row 1 */\
3102
fcc2892eeab3 10l in 1.12
lorenm
parents: 3096
diff changeset
1362 "psraw $5, %%mm0 \n\t"\
fcc2892eeab3 10l in 1.12
lorenm
parents: 3096
diff changeset
1363 "psraw $5, %%mm1 \n\t"\
3163
b67ef5ea4d99 remove some useless instructions
lorenm
parents: 3156
diff changeset
1364 "packuswb %%mm0, %%mm0 \n\t"\
b67ef5ea4d99 remove some useless instructions
lorenm
parents: 3156
diff changeset
1365 "packuswb %%mm1, %%mm1 \n\t"\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1366 PAVGB" (%0), %%mm0 \n\t"\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1367 PAVGB" (%0,%3), %%mm1 \n\t"\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1368 OP(%%mm0, (%2), %%mm4, d)\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1369 OP(%%mm1, (%2,%4), %%mm5, d)\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1370 "lea (%0,%3,2), %0 \n\t" /* src8 += 2*src8Stride */\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1371 "lea (%2,%4,2), %2 \n\t" /* dst += 2*dstStride */\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1372 "movq 48(%1), %%mm0 \n\t" /* src16 row 2 */\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1373 "movq 72(%1), %%mm1 \n\t" /* src16 row 3 */\
3102
fcc2892eeab3 10l in 1.12
lorenm
parents: 3096
diff changeset
1374 "psraw $5, %%mm0 \n\t"\
fcc2892eeab3 10l in 1.12
lorenm
parents: 3096
diff changeset
1375 "psraw $5, %%mm1 \n\t"\
3163
b67ef5ea4d99 remove some useless instructions
lorenm
parents: 3156
diff changeset
1376 "packuswb %%mm0, %%mm0 \n\t"\
b67ef5ea4d99 remove some useless instructions
lorenm
parents: 3156
diff changeset
1377 "packuswb %%mm1, %%mm1 \n\t"\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1378 PAVGB" (%0), %%mm0 \n\t"\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1379 PAVGB" (%0,%3), %%mm1 \n\t"\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1380 OP(%%mm0, (%2), %%mm4, d)\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1381 OP(%%mm1, (%2,%4), %%mm5, d)\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1382 :"+a"(src8), "+c"(src16), "+d"(dst)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1383 :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1384 :"memory");\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1385 }\
4527
481763d70193 prevent h.264 MC related functions from being inlined (yes this is much faster the code just doesnt fit in the code cache otherwise)
michael
parents: 4137
diff changeset
1386 static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h) /* 8-wide variant, two rows per loop iteration: int16 rows of src16 (48 bytes apart) are shifted right by 5, packed to unsigned bytes, combined with the matching src8 rows via PAVGB and stored to dst through OP */\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1387 {\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1388 do{\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1389 __asm__ volatile(\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1390 "movq (%1), %%mm0 \n\t" /* first row, words 0..3 */\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1391 "movq 8(%1), %%mm1 \n\t" /* first row, words 4..7 */\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1392 "movq 48(%1), %%mm2 \n\t" /* second row, words 0..3 */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1393 "movq 8+48(%1), %%mm3 \n\t" /* second row, words 4..7 */\
3102
fcc2892eeab3 10l in 1.12
lorenm
parents: 3096
diff changeset
1394 "psraw $5, %%mm0 \n\t"\
fcc2892eeab3 10l in 1.12
lorenm
parents: 3096
diff changeset
1395 "psraw $5, %%mm1 \n\t"\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1396 "psraw $5, %%mm2 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1397 "psraw $5, %%mm3 \n\t"\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1398 "packuswb %%mm1, %%mm0 \n\t"\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1399 "packuswb %%mm3, %%mm2 \n\t"\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1400 PAVGB" (%0), %%mm0 \n\t"\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1401 PAVGB" (%0,%3), %%mm2 \n\t"\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1402 OP(%%mm0, (%2), %%mm5, q)\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1403 OP(%%mm2, (%2,%4), %%mm5, q)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1404 ::"a"(src8), "c"(src16), "d"(dst),\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1405 "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1406 :"memory");\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1407 src8 += 2L*src8Stride; /* pointers are advanced in C because the asm takes them as inputs only */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1408 src16 += 48; /* 48 int16 = two rows of 48 bytes */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1409 dst += 2L*dstStride;\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1410 }while(h-=2); /* loop ends only when h reaches exactly 0, so h is presumably a positive even row count - TODO confirm with callers */\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1411 }\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1412 static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h) /* 16-wide variant: two calls to the 8-wide routine, the second shifted 8 columns right in dst, src16 and src8 */\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1413 {\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1414 OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1415 OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1416 }\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1417
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1418
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1419 #ifdef ARCH_X86_64
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * ARCH_X86_64 variant of QPEL_H264_H16_XMM. Emits
 * OPNAME##h264_qpel16_h_lowpass_l2_##MMX, which processes all 16 rows in a
 * single asm loop and uses registers up to xmm15.
 *
 * Per row: 16 source bytes are loaded at src+3 and at src-5, widened to
 * words against the zeroed xmm15, and shifted copies are built with palignr.
 * From three pairwise word sums p, q, r the loop evaluates
 *     (5*(4*p - q) + r + 16) >> 5
 * with xmm14 = ff_pw_5 and xmm13 = ff_pw_16. The two halves are packed to
 * 16 unsigned bytes, combined with 16 bytes from src2 via pavgb and stored
 * to dst through OP. src and dst both advance by dstStride, src2 by
 * src2Stride.
 *
 * NOTE(review): no xmm register is listed in the clobber list - confirm this
 * is acceptable for every targeted ABI.
 */
1420 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1421 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1422 int h=16;\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1423 __asm__ volatile(\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1424 "pxor %%xmm15, %%xmm15 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1425 "movdqa %6, %%xmm14 \n\t" /* xmm14 = ff_pw_5 */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1426 "movdqa %7, %%xmm13 \n\t" /* xmm13 = ff_pw_16 */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1427 "1: \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1428 "lddqu 3(%0), %%xmm1 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1429 "lddqu -5(%0), %%xmm7 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1430 "movdqa %%xmm1, %%xmm0 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1431 "punpckhbw %%xmm15, %%xmm1 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1432 "punpcklbw %%xmm15, %%xmm0 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1433 "punpcklbw %%xmm15, %%xmm7 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1434 "movdqa %%xmm1, %%xmm2 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1435 "movdqa %%xmm0, %%xmm6 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1436 "movdqa %%xmm1, %%xmm3 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1437 "movdqa %%xmm0, %%xmm8 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1438 "movdqa %%xmm1, %%xmm4 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1439 "movdqa %%xmm0, %%xmm9 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1440 "movdqa %%xmm1, %%xmm5 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1441 "movdqa %%xmm0, %%xmm10 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1442 "palignr $6, %%xmm0, %%xmm5 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1443 "palignr $6, %%xmm7, %%xmm10\n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1444 "palignr $8, %%xmm0, %%xmm4 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1445 "palignr $8, %%xmm7, %%xmm9 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1446 "palignr $10,%%xmm0, %%xmm3 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1447 "palignr $10,%%xmm7, %%xmm8 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1448 "paddw %%xmm1, %%xmm5 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1449 "paddw %%xmm0, %%xmm10 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1450 "palignr $12,%%xmm0, %%xmm2 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1451 "palignr $12,%%xmm7, %%xmm6 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1452 "palignr $14,%%xmm0, %%xmm1 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1453 "palignr $14,%%xmm7, %%xmm0 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1454 "paddw %%xmm3, %%xmm2 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1455 "paddw %%xmm8, %%xmm6 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1456 "paddw %%xmm4, %%xmm1 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1457 "paddw %%xmm9, %%xmm0 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1458 "psllw $2, %%xmm2 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1459 "psllw $2, %%xmm6 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1460 "psubw %%xmm1, %%xmm2 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1461 "psubw %%xmm0, %%xmm6 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1462 "paddw %%xmm13,%%xmm5 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1463 "paddw %%xmm13,%%xmm10 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1464 "pmullw %%xmm14,%%xmm2 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1465 "pmullw %%xmm14,%%xmm6 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1466 "lddqu (%2), %%xmm3 \n\t" /* 16 bytes of the current src2 row */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1467 "paddw %%xmm5, %%xmm2 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1468 "paddw %%xmm10,%%xmm6 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1469 "psraw $5, %%xmm2 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1470 "psraw $5, %%xmm6 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1471 "packuswb %%xmm2,%%xmm6 \n\t" /* xmm6 = 16 saturated result bytes */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1472 "pavgb %%xmm3, %%xmm6 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1473 OP(%%xmm6, (%1), %%xmm4, dqa)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1474 "add %5, %0 \n\t" /* src += dstStride */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1475 "add %5, %1 \n\t" /* dst += dstStride */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1476 "add %4, %2 \n\t" /* src2 += src2Stride */\
6345
0b7fbd57c489 asm argument that might be in memory needs a size
lorenm
parents: 6336
diff changeset
1477 "decl %3 \n\t"\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1478 "jg 1b \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1479 : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1480 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1481 "m"(ff_pw_5), "m"(ff_pw_16)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1482 : "memory"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1483 );\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1484 }
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1485 #else // ARCH_X86_64
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * Variant of QPEL_H264_H16_XMM used when ARCH_X86_64 is not defined.
 * OPNAME##h264_qpel16_h_lowpass_l2_##MMX is assembled from four calls to the
 * 8-wide OPNAME##h264_qpel8_h_lowpass_l2_##MMX: two side by side, then two
 * more after stepping the pointers by 8 rows. src is stepped with dstStride
 * because the function has no separate source stride parameter.
 */
1486 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1487 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1488 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1489 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1490 src += 8*dstStride;\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1491 dst += 8*dstStride;\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1492 src2 += 8*src2Stride;\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1493 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1494 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1495 }
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1496 #endif // ARCH_X86_64
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1497
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * QPEL_H264_H_XMM(OPNAME, OP, MMX)
 *
 * Generates the horizontal lowpass functions used by the H.264 quarter-pel
 * motion compensation code for one OPNAME / MMX name pair:
 *
 *   OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX
 *       8 pixels wide, 8 rows. The filtered row is byte-averaged (pavgb)
 *       with the matching 8 bytes of src2 before OP stores it to dst.
 *   QPEL_H264_H16_XMM(OPNAME, OP, MMX)
 *       expanded here; its definition lies outside this macro.
 *   OPNAME ## h264_qpel8_h_lowpass_ ## MMX
 *       8 pixels wide, 8 rows, no src2 averaging.
 *   OPNAME ## h264_qpel16_h_lowpass_ ## MMX
 *       16x16, built from four calls of the 8x8 function.
 *
 * For every output pixel the asm computes on 16-bit lanes
 *   (src[-2] + src[3] + ff_pw_5 * (4*(src[0] + src[1]) - (src[-1] + src[2]))
 *    + ff_pw_16) >> 5
 * and packs the result to a byte with unsigned saturation (packuswb).
 * ff_pw_5 and ff_pw_16 are defined outside this chunk; presumably they are
 * vectors of the 16-bit constants 5 and 16 - TODO confirm.
 * OP is a caller-supplied macro; presumably it writes xmm2 to (dst), either
 * as a plain store or averaged with the old contents - TODO confirm.
 *
 * Each row reads 16 bytes starting at src-5 (lddqu -5(%0)).
 *
 * NOTE(review): in the _l2 variant xmm6 and xmm7 are initialised by one asm
 * statement and read by the separate asm statement inside the do/while loop;
 * no constraint or clobber expresses that dependency.
 * NOTE(review): the _l2 variant advances both src and dst by dstStride; it
 * has no separate source stride parameter.
 */
1498 #define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1499 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1500 int h=8;\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1501 __asm__ volatile(\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1502 "pxor %%xmm7, %%xmm7 \n\t" /* xmm7 = 0, used to widen bytes to words */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1503 "movdqa %0, %%xmm6 \n\t" /* xmm6 = ff_pw_5 */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1504 :: "m"(ff_pw_5)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1505 );\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1506 do{\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1507 __asm__ volatile(\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1508 "lddqu -5(%0), %%xmm1 \n\t" /* 16 unaligned bytes: src[-5..10] */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1509 "movdqa %%xmm1, %%xmm0 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1510 "punpckhbw %%xmm7, %%xmm1 \n\t" /* xmm1 = src[3..10] as words */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1511 "punpcklbw %%xmm7, %%xmm0 \n\t" /* xmm0 = src[-5..2] as words */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1512 "movdqa %%xmm1, %%xmm2 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1513 "movdqa %%xmm1, %%xmm3 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1514 "movdqa %%xmm1, %%xmm4 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1515 "movdqa %%xmm1, %%xmm5 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1516 "palignr $6, %%xmm0, %%xmm5 \n\t" /* xmm5 = src[-2..5] */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1517 "palignr $8, %%xmm0, %%xmm4 \n\t" /* xmm4 = src[-1..6] */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1518 "palignr $10,%%xmm0, %%xmm3 \n\t" /* xmm3 = src[0..7] */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1519 "paddw %%xmm1, %%xmm5 \n\t" /* xmm5 = src[-2..5] + src[3..10] (outer taps) */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1520 "palignr $12,%%xmm0, %%xmm2 \n\t" /* xmm2 = src[1..8] */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1521 "palignr $14,%%xmm0, %%xmm1 \n\t" /* xmm1 = src[2..9] */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1522 "paddw %%xmm3, %%xmm2 \n\t" /* xmm2 = src[0..7] + src[1..8] (centre taps) */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1523 "paddw %%xmm4, %%xmm1 \n\t" /* xmm1 = src[-1..6] + src[2..9] (inner taps) */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1524 "psllw $2, %%xmm2 \n\t" /* centre sum * 4 */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1525 "movq (%2), %%xmm3 \n\t" /* 8 bytes of the src2 row */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1526 "psubw %%xmm1, %%xmm2 \n\t" /* 4*centre - inner */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1527 "paddw %5, %%xmm5 \n\t" /* outer += ff_pw_16 (rounding term) */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1528 "pmullw %%xmm6, %%xmm2 \n\t" /* multiply by xmm6 (ff_pw_5) */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1529 "paddw %%xmm5, %%xmm2 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1530 "psraw $5, %%xmm2 \n\t" /* arithmetic shift right by 5 */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1531 "packuswb %%xmm2, %%xmm2 \n\t" /* words to bytes with unsigned saturation */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1532 "pavgb %%xmm3, %%xmm2 \n\t" /* rounded byte average with the src2 row */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1533 OP(%%xmm2, (%1), %%xmm4, q) /* presumably stores xmm2 to dst - TODO confirm OP */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1534 "add %4, %0 \n\t" /* src += dstStride (operand 4), not a separate source stride */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1535 "add %4, %1 \n\t" /* dst += dstStride */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1536 "add %3, %2 \n\t" /* src2 += src2Stride */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1537 : "+a"(src), "+c"(dst), "+d"(src2) /* operands 0, 1, 2 */\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1538 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride), /* operands 3, 4 */\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1539 "m"(ff_pw_16) /* operand 5 */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1540 : "memory"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1541 );\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1542 }while(--h);\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1543 }\
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1544 QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1545 \
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1546 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* same filter without src2 averaging; the 8-row loop is inside the asm (label 1, decl and jnz on h) */\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1547 int h=8;\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1548 __asm__ volatile(\
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1549 "pxor %%xmm7, %%xmm7 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1550 "movdqa %5, %%xmm6 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1551 "1: \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1552 "lddqu -5(%0), %%xmm1 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1553 "movdqa %%xmm1, %%xmm0 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1554 "punpckhbw %%xmm7, %%xmm1 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1555 "punpcklbw %%xmm7, %%xmm0 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1556 "movdqa %%xmm1, %%xmm2 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1557 "movdqa %%xmm1, %%xmm3 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1558 "movdqa %%xmm1, %%xmm4 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1559 "movdqa %%xmm1, %%xmm5 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1560 "palignr $6, %%xmm0, %%xmm5 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1561 "palignr $8, %%xmm0, %%xmm4 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1562 "palignr $10,%%xmm0, %%xmm3 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1563 "paddw %%xmm1, %%xmm5 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1564 "palignr $12,%%xmm0, %%xmm2 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1565 "palignr $14,%%xmm0, %%xmm1 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1566 "paddw %%xmm3, %%xmm2 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1567 "paddw %%xmm4, %%xmm1 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1568 "psllw $2, %%xmm2 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1569 "psubw %%xmm1, %%xmm2 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1570 "paddw %6, %%xmm5 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1571 "pmullw %%xmm6, %%xmm2 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1572 "paddw %%xmm5, %%xmm2 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1573 "psraw $5, %%xmm2 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1574 "packuswb %%xmm2, %%xmm2 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1575 OP(%%xmm2, (%1), %%xmm4, q)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1576 "add %3, %0 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1577 "add %4, %1 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1578 "decl %2 \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1579 " jnz 1b \n\t"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1580 : "+a"(src), "+c"(dst), "+g"(h)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1581 : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1582 "m"(ff_pw_5), "m"(ff_pw_16)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1583 : "memory"\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1584 );\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1585 }\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1586 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 16x16 as four 8x8 quadrants: top-left, top-right, then 8 rows down */\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1587 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1588 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1589 src += 8*srcStride;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1590 dst += 8*dstStride;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1591 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1592 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1593 }\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1594
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * QPEL_H264_V_XMM(OPNAME, OP, MMX)
 *
 * Generates the vertical lowpass functions for one OPNAME / MMX name pair:
 *
 *   OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst, src, dstStride, srcStride, h)
 *       handles one 8-pixel-wide column strip. It always runs eight
 *       QPEL_H264V_XMM steps and runs eight more when h == 16.
 *   OPNAME ## h264_qpel8_v_lowpass_ ## MMX
 *       wrapper calling the function above with h = 8.
 *   OPNAME ## h264_qpel16_v_lowpass_ ## MMX
 *       two h = 16 strips, at column offsets 0 and 8.
 *
 * QPEL_H264V_XMM is defined outside this chunk. It is invoked with the six
 * registers xmm0..xmm5 rotated by one position per step; presumably each step
 * loads one new source row, emits one output row through OP and advances
 * src and dst - TODO confirm against its definition.
 *
 * asm operands: %0 src, %1 dst, %2 srcStride, %3 dstStride,
 *               %4 ff_pw_5, %5 ff_pw_16.
 *
 * NOTE(review): the asm statement in the h == 16 branch continues from the
 * xmm register contents left by the first asm statement; no constraint or
 * clobber expresses that dependency.
 */
1595 #define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1596 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1597 src -= 2*srcStride; /* start two rows above the first output row */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1598 \
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1599 __asm__ volatile(\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1600 "pxor %%xmm7, %%xmm7 \n\t" /* xmm7 = 0 for byte to word unpacking */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1601 "movq (%0), %%xmm0 \n\t" /* load five rows of 8 bytes into xmm0..xmm4, stepping src by srcStride */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1602 "add %2, %0 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1603 "movq (%0), %%xmm1 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1604 "add %2, %0 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1605 "movq (%0), %%xmm2 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1606 "add %2, %0 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1607 "movq (%0), %%xmm3 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1608 "add %2, %0 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1609 "movq (%0), %%xmm4 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1610 "add %2, %0 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1611 "punpcklbw %%xmm7, %%xmm0 \n\t" /* widen the five loaded rows to 16-bit lanes */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1612 "punpcklbw %%xmm7, %%xmm1 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1613 "punpcklbw %%xmm7, %%xmm2 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1614 "punpcklbw %%xmm7, %%xmm3 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1615 "punpcklbw %%xmm7, %%xmm4 \n\t"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1616 QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP) /* eight filter steps; register roles rotate by one per step */\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1617 QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1618 QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1619 QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1620 QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1621 QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1622 QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1623 QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1624 \
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1625 : "+a"(src), "+c"(dst) /* operands 0, 1 */\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1626 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16) /* operands 2, 3, 4, 5 */\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1627 : "memory"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1628 );\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1629 if(h==16){ /* eight more steps, continuing the register rotation of the first asm statement */\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1630 __asm__ volatile(\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1631 QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1632 QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1633 QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1634 QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1635 QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1636 QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1637 QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1638 QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1639 \
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1640 : "+a"(src), "+c"(dst)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1641 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1642 : "memory"\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1643 );\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1644 }\
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1645 }\
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1646 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1647 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1648 }\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1649 static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1650 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1651 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1652 }
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1653
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * put_h264_qpel8or16_hv1_lowpass_sse2
 *
 * First (hv1) pass of the SSE2 qpel hv lowpass: fills tmp with 16-bit values
 * computed from src by the QPEL_H264HV_XMM steps.
 *
 * The source is walked in 8-pixel-wide column strips, (size+8)>>3 strips in
 * total (2 for size 8, 3 for size 16), starting 2 rows above and 2 pixels
 * to the left of src. For each strip the function loads five rows of 8 bytes,
 * widens them to 16-bit lanes and runs QPEL_H264HV_XMM 8 times, or 16 times
 * when size == 16, passing offsets 0*48 .. 15*48.
 * QPEL_H264HV_XMM is defined outside this chunk; presumably each step loads
 * one more source row and writes one row of 16-bit intermediates at
 * tmp + offset bytes - TODO confirm against its definition.
 * After a strip, tmp advances by 8 int16_t elements and src is moved 8 pixels
 * to the right and (size+5) rows back up.
 *
 * tmp        destination for the intermediates (asm operand 1)
 * src        source pixels (asm operand 0, updated by the asm)
 * tmpStride  not referenced in this body; the row offsets are the hard-coded
 *            multiples of 48
 * srcStride  source row stride (asm operand 2)
 * size       16 selects the extra eight steps; any other value runs eight
 *
 * NOTE(review): the asm statement in the size == 16 branch relies on the xmm
 * register contents left by the preceding asm statement; no constraint or
 * clobber expresses that dependency.
 */
1654 static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1655 int w = (size+8)>>3;
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1656 src -= 2*srcStride+2; /* back up 2 rows and 2 pixels */
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1657 while(w--){
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1658 __asm__ volatile(
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1659 "pxor %%xmm7, %%xmm7 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1660 "movq (%0), %%xmm0 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1661 "add %2, %0 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1662 "movq (%0), %%xmm1 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1663 "add %2, %0 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1664 "movq (%0), %%xmm2 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1665 "add %2, %0 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1666 "movq (%0), %%xmm3 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1667 "add %2, %0 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1668 "movq (%0), %%xmm4 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1669 "add %2, %0 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1670 "punpcklbw %%xmm7, %%xmm0 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1671 "punpcklbw %%xmm7, %%xmm1 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1672 "punpcklbw %%xmm7, %%xmm2 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1673 "punpcklbw %%xmm7, %%xmm3 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1674 "punpcklbw %%xmm7, %%xmm4 \n\t"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1675 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) /* filter steps; registers rotate by one per step, last argument is the offset in multiples of 48 */
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1676 QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1677 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1678 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1679 QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1680 QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1681 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1682 QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1683 : "+a"(src)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1684 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1685 : "memory"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1686 );
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1687 if(size==16){
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
1688 __asm__ volatile(
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1689 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1690 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1691 QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1692 QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1693 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1694 QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1695 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1696 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1697 : "+a"(src)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6557
diff changeset
1698 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1699 : "memory"
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1700 );
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1701 }
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1702 tmp += 8; /* next strip: 8 int16_t elements further into tmp */
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1703 src += 8 - (size+5)*srcStride;
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1704 }
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1705 }
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1706
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * QPEL_H264_HV2_XMM(OPNAME, OP, MMX)
 *
 * Defines OPNAME##h264_qpel8or16_hv2_lowpass_##MMX(), the second (horizontal)
 * pass of the "hv" lowpass: it reads rows of 16-bit intermediates from tmp
 * and writes `size` rows of 8-bit pixels to dst.
 *
 *  - tmp is read with movdqa, so it has to be 16-byte aligned; consecutive
 *    rows are 48 bytes apart (the asm adds a constant 48 to the pointer).
 *    The tmpStride parameter is not used by this implementation.
 *  - dst advances by dstStride bytes per row.
 *  - size == 16 selects the 16-pixel-wide loop, any other value the
 *    8-pixel-wide loop; in both cases `size` is also the row count.
 *  - OP(reg, mem, scratch, suffix) must expand to asm text that commits
 *    `reg` to `mem` (NOTE(review): presumably a plain store for put_ and an
 *    average-with-destination for avg_; OP is defined outside this macro).
 *
 * For each output pixel the six neighbouring intermediates t[0..5] are
 * combined as a = t0+t5, b = t1+t4, c = t2+t3 and then
 *     (((a - b) >> 2) - b + c) >> 2, + c, >> 6
 * which evaluates (a - 5*b + 20*c) / 1024 step by step in 16-bit lanes.
 * The shifted neighbours are produced with palignr (an SSSE3 instruction).
 */
#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int rows = size;\
    if(size != 16){\
        /* 8 pixels per row: xmm0 = t[0..7], xmm1 = t[8..15] */\
        __asm__ volatile(\
            "1: \n\t"\
            "movdqa 16(%0), %%xmm1 \n\t"\
            "movdqa (%0), %%xmm0 \n\t"\
            "movdqa %%xmm1, %%xmm2 \n\t"\
            "movdqa %%xmm1, %%xmm3 \n\t"\
            "movdqa %%xmm1, %%xmm4 \n\t"\
            "movdqa %%xmm1, %%xmm5 \n\t"\
            /* xmm1..xmm5 = t shifted by 1..5 words */\
            "palignr $10, %%xmm0, %%xmm5 \n\t"\
            "palignr $8, %%xmm0, %%xmm4 \n\t"\
            "palignr $6, %%xmm0, %%xmm3 \n\t"\
            "palignr $4, %%xmm0, %%xmm2 \n\t"\
            "palignr $2, %%xmm0, %%xmm1 \n\t"\
            /* xmm0 = a, xmm1 = b, xmm2 = c */\
            "paddw %%xmm5, %%xmm0 \n\t"\
            "paddw %%xmm4, %%xmm1 \n\t"\
            "paddw %%xmm3, %%xmm2 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "psraw $6, %%xmm0 \n\t"\
            /* saturate to unsigned bytes and commit 8 of them */\
            "packuswb %%xmm0, %%xmm0 \n\t"\
            OP(%%xmm0, (%1), %%xmm7, q)\
            "add $48, %0 \n\t"\
            "add %3, %1 \n\t"\
            "decl %2 \n\t"\
            " jnz 1b \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(rows)\
            : "S"((x86_reg)dstStride)\
            : "memory"\
        );\
    }else{\
        /* 16 pixels per row: xmm7 = t[0..7], xmm5 = t[8..15], xmm4 = t[16..23] */\
        __asm__ volatile(\
            "1: \n\t"\
            "movdqa 32(%0), %%xmm4 \n\t"\
            "movdqa 16(%0), %%xmm5 \n\t"\
            "movdqa (%0), %%xmm7 \n\t"\
            "movdqa %%xmm4, %%xmm3 \n\t"\
            "movdqa %%xmm4, %%xmm2 \n\t"\
            "movdqa %%xmm4, %%xmm1 \n\t"\
            "movdqa %%xmm4, %%xmm0 \n\t"\
            /* right half (pixels 8..15): xmm0 = a, xmm1 = b, xmm2 = c */\
            "palignr $10, %%xmm5, %%xmm0 \n\t"\
            "palignr $8, %%xmm5, %%xmm1 \n\t"\
            "palignr $6, %%xmm5, %%xmm2 \n\t"\
            "palignr $4, %%xmm5, %%xmm3 \n\t"\
            "palignr $2, %%xmm5, %%xmm4 \n\t"\
            "paddw %%xmm5, %%xmm0 \n\t"\
            "paddw %%xmm4, %%xmm1 \n\t"\
            "paddw %%xmm3, %%xmm2 \n\t"\
            /* left half (pixels 0..7): xmm3 = a, xmm4 = b, xmm5 = c */\
            "movdqa %%xmm5, %%xmm6 \n\t"\
            "movdqa %%xmm5, %%xmm4 \n\t"\
            "movdqa %%xmm5, %%xmm3 \n\t"\
            "palignr $8, %%xmm7, %%xmm4 \n\t"\
            "palignr $2, %%xmm7, %%xmm6 \n\t"\
            "palignr $10, %%xmm7, %%xmm3 \n\t"\
            "paddw %%xmm6, %%xmm4 \n\t"\
            "movdqa %%xmm5, %%xmm6 \n\t"\
            "palignr $6, %%xmm7, %%xmm5 \n\t"\
            "palignr $4, %%xmm7, %%xmm6 \n\t"\
            "paddw %%xmm7, %%xmm3 \n\t"\
            "paddw %%xmm6, %%xmm5 \n\t"\
            \
            /* both halves run the same shift/add ladder in lock-step */\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psubw %%xmm4, %%xmm3 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psraw $2, %%xmm3 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psubw %%xmm4, %%xmm3 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "paddw %%xmm5, %%xmm3 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psraw $2, %%xmm3 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "paddw %%xmm5, %%xmm3 \n\t"\
            "psraw $6, %%xmm0 \n\t"\
            "psraw $6, %%xmm3 \n\t"\
            /* xmm3 = left 8 bytes | right 8 bytes, saturated */\
            "packuswb %%xmm0, %%xmm3 \n\t"\
            OP(%%xmm3, (%1), %%xmm7, dqa)\
            "add $48, %0 \n\t"\
            "add %3, %1 \n\t"\
            "decl %2 \n\t"\
            " jnz 1b \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(rows)\
            : "S"((x86_reg)dstStride)\
            : "memory"\
        );\
    }\
}
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1801
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * QPEL_H264_HV_XMM(OPNAME, OP, MMX)
 *
 * Defines three functions:
 *   OPNAME##h264_qpel8or16_hv_lowpass_##MMX  - runs the shared first pass
 *       put_h264_qpel8or16_hv1_lowpass_sse2() from src into tmp, then the
 *       OPNAME/MMX specific second pass from tmp into dst;
 *   OPNAME##h264_qpel8_hv_lowpass_##MMX      - the above with size = 8;
 *   OPNAME##h264_qpel16_hv_lowpass_##MMX     - the above with size = 16.
 *
 * The first pass is always the put_/sse2 variant regardless of OPNAME and
 * MMX. The OP argument is accepted but not referenced in this macro.
 */
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(\
        uint8_t *dst, int16_t *tmp, uint8_t *src,\
        int dstStride, int tmpStride, int srcStride, int size)\
{\
    /* pass 1: src -> tmp */\
    put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    /* pass 2: tmp -> dst */\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(\
        uint8_t *dst, int16_t *tmp, uint8_t *src,\
        int dstStride, int tmpStride, int srcStride)\
{\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(\
        uint8_t *dst, int16_t *tmp, uint8_t *src,\
        int dstStride, int tmpStride, int srcStride)\
{\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1813
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * Name aliases: the SSE2/SSSE3 instantiations of the H264_MC_* macros paste
 * "_sse2" / "_ssse3" onto helper names. Where no dedicated implementation
 * exists, the pasted name is redirected to an existing one.
 */

/* two-source pixel combine: sse2 -> mmx2 */
#define put_pixels8_l2_sse2              put_pixels8_l2_mmx2
#define put_pixels16_l2_sse2             put_pixels16_l2_mmx2
#define avg_pixels8_l2_sse2              avg_pixels8_l2_mmx2
#define avg_pixels16_l2_sse2             avg_pixels16_l2_mmx2
/* two-source pixel combine: ssse3 -> mmx2 */
#define put_pixels8_l2_ssse3             put_pixels8_l2_mmx2
#define put_pixels16_l2_ssse3            put_pixels16_l2_mmx2
#define avg_pixels8_l2_ssse3             avg_pixels8_l2_mmx2
#define avg_pixels16_l2_ssse3            avg_pixels16_l2_mmx2

/* "shift5" two-source combine: sse2 -> mmx2 */
#define put_pixels8_l2_shift5_sse2       put_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_sse2      put_pixels16_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_sse2       avg_pixels8_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_sse2      avg_pixels16_l2_shift5_mmx2
/* "shift5" two-source combine: ssse3 -> mmx2 */
#define put_pixels8_l2_shift5_ssse3      put_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_ssse3     put_pixels16_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_ssse3      avg_pixels8_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_ssse3     avg_pixels16_l2_shift5_mmx2

/* horizontal lowpass + l2: sse2 -> mmx2 */
#define put_h264_qpel8_h_lowpass_l2_sse2   put_h264_qpel8_h_lowpass_l2_mmx2
#define put_h264_qpel16_h_lowpass_l2_sse2  put_h264_qpel16_h_lowpass_l2_mmx2
#define avg_h264_qpel8_h_lowpass_l2_sse2   avg_h264_qpel8_h_lowpass_l2_mmx2
#define avg_h264_qpel16_h_lowpass_l2_sse2  avg_h264_qpel16_h_lowpass_l2_mmx2

/* vertical lowpass: ssse3 -> sse2 */
#define put_h264_qpel8_v_lowpass_ssse3   put_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3  put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3   avg_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3  avg_h264_qpel16_v_lowpass_sse2

/* second hv pass: sse2 -> mmx2 */
#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1844
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
/*
 * H264_MC(OPNAME, SIZE, MMX, ALIGN)
 *
 * Convenience wrapper that instantiates all four H264_MC_* groups
 * (C, V, H, HV - in that order) with the same arguments.
 */
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
    H264_MC_C (OPNAME, SIZE, MMX, ALIGN)\
    H264_MC_V (OPNAME, SIZE, MMX, ALIGN)\
    H264_MC_H (OPNAME, SIZE, MMX, ALIGN)\
    H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1850
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * "put" entry for the 16-wide mc00 position, SSE2 flavour.
 * Thin wrapper: hands dst/src/stride to put_pixels16_sse2() with 16 as the
 * last argument (presumably the number of rows; the helper is defined
 * elsewhere).
 */
static void put_h264_qpel16_mc00_sse2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_sse2(dst, src, stride, 16);
}
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * "avg" entry for the 16-wide mc00 position, SSE2 flavour.
 * Thin wrapper: hands dst/src/stride to avg_pixels16_sse2() with 16 as the
 * last argument (presumably the number of rows; the helper is defined
 * elsewhere).
 */
static void avg_h264_qpel16_mc00_sse2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_sse2(dst, src, stride, 16);
}
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/* The 8-wide mc00 entries have no SSE2 version of their own: reuse mmx2. */
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1859
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * H264_MC_C(OPNAME, SIZE, MMX, ALIGN)
 *
 * Defines OPNAME##h264_qpel##SIZE##_mc00_##MMX(dst, src, stride), which
 * forwards to OPNAME##pixels##SIZE##_##MMX(dst, src, stride, SIZE).
 * ALIGN is accepted for signature parity with the other H264_MC_* macros
 * and is not used here.
 */
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1864
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * H264_MC_H(OPNAME, SIZE, MMX, ALIGN)
 *
 * Defines the three entries that only need the horizontal lowpass:
 *   _mc10_ : h_lowpass_l2 with src itself as the second source
 *   _mc20_ : plain h_lowpass
 *   _mc30_ : h_lowpass_l2 with src+1 as the second source
 * (NOTE(review): the "_l2" helpers presumably blend the filtered result
 * with the second source; they are defined elsewhere.)
 * All strides passed down equal `stride`. ALIGN is not used here.
 */
#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1877
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * H264_MC_V(OPNAME, SIZE, MMX, ALIGN)
 *
 * Defines the three entries that only need the vertical lowpass:
 *   _mc01_ : put_ v_lowpass into a SIZE x SIZE scratch block, then
 *            OPNAME pixels_l2 of (src, scratch) into dst
 *   _mc02_ : OPNAME v_lowpass straight into dst
 *   _mc03_ : like _mc01_, but the l2 step pairs the scratch block with
 *            src+stride instead of src
 * The scratch block is always produced with the put_ variant, whatever
 * OPNAME is; it is declared with DECLARE_ALIGNED(ALIGN, ...) and uses a row
 * stride of SIZE.
 * (NOTE(review): pixels_l2 presumably averages its two sources; it is
 * defined elsewhere.)
 */
#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride)\
{\
    DECLARE_ALIGNED(ALIGN, uint8_t, vfilt[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(vfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, vfilt, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride)\
{\
    DECLARE_ALIGNED(ALIGN, uint8_t, vfilt[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(vfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, vfilt, stride, stride, SIZE);\
}
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1894
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1895 #define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1896 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6334
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1897 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1898 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1899 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1900 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1901 \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1902 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6334
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1903 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1904 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1905 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1906 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1907 \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1908 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6334
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1909 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1910 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1911 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1912 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1913 \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1914 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6334
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1915 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1916 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1917 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1918 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1919 \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1920 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6334
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1921 DECLARE_ALIGNED(ALIGN, uint16_t, temp[SIZE*(SIZE<8?12:24)]);\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1922 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1923 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1924 \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1925 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6334
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1926 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1927 uint8_t * const halfHV= temp;\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1928 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
5809
7301ea0ae221 Fix intended order of operations for 4 assert() checks.
cehoyos
parents: 4939
diff changeset
1929 assert(((int)temp & 7) == 0);\
6334
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1930 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
3156
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1931 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1932 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1933 \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1934 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6334
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1935 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1936 uint8_t * const halfHV= temp;\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1937 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
5809
7301ea0ae221 Fix intended order of operations for 4 assert() checks.
cehoyos
parents: 4939
diff changeset
1938 assert(((int)temp & 7) == 0);\
6334
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1939 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
3156
5b6d0dd37ca7 4% faster h264_qpel_mc
lorenm
parents: 3105
diff changeset
1940 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1941 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1942 \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1943 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6334
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1944 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1945 uint8_t * const halfHV= temp;\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1946 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
5809
7301ea0ae221 Fix intended order of operations for 4 assert() checks.
cehoyos
parents: 4939
diff changeset
1947 assert(((int)temp & 7) == 0);\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1948 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1949 OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1950 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1951 \
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1952 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6334
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1953 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1954 uint8_t * const halfHV= temp;\
bf8c9e82c7ad fix aliasing warnings. simpler too.
lorenm
parents: 6331
diff changeset
1955 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
5809
7301ea0ae221 Fix intended order of operations for 4 assert() checks.
cehoyos
parents: 4939
diff changeset
1956 assert(((int)temp & 7) == 0);\
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1957 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1958 OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1959 }\
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1960
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * H264_MC_4816(MMX): expands H264_MC six times -- put_ and avg_, each at
 * sizes 4, 8 and 16 -- forwarding MMX as the third argument and the constant
 * 8 as the fourth.  H264_MC itself is defined outside this excerpt.
 * NOTE(review): the fourth argument is presumably the ALIGN value handed to
 * DECLARE_ALIGNED inside the H264_MC_* bodies -- confirm against H264_MC.
 * The last expansion line ends in a continuation backslash, so the macro
 * body runs on into the following (empty) source line.
 */
1961 #define H264_MC_4816(MMX)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1962 H264_MC(put_, 4, MMX, 8)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1963 H264_MC(put_, 8, MMX, 8)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1964 H264_MC(put_, 16,MMX, 8)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1965 H264_MC(avg_, 4, MMX, 8)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1966 H264_MC(avg_, 8, MMX, 8)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1967 H264_MC(avg_, 16,MMX, 8)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1968
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * H264_MC_816(QPEL, XMM): invokes the macro passed as QPEL four times --
 * put_ and avg_, each at sizes 8 and 16 -- with XMM as the third argument
 * and the constant 16 as the fourth.  No size-4 expansion is produced.
 * Further down this file it is invoked with H264_MC_V, H264_MC_H and
 * H264_MC_HV for the sse2 and ssse3 suffixes.
 * The last expansion line ends in a continuation backslash, so the macro
 * body runs on into the following (empty) source line.
 */
1969 #define H264_MC_816(QPEL, XMM)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1970 QPEL(put_, 8, XMM, 16)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1971 QPEL(put_, 16,XMM, 16)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1972 QPEL(avg_, 8, XMM, 16)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1973 QPEL(avg_, 16,XMM, 16)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1974
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1975
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
/*
 * AVG_3DNOW_OP(a, b, temp, size): builds an asm string of three
 * instructions (AT&T operand order, source first):
 *     mov<size> b, temp
 *     pavgusb   temp, a
 *     mov<size> a, b
 * In effect b is replaced by the packed unsigned-byte average of a and the
 * previous contents of b; temp and a are overwritten along the way.
 * size is stringified directly after mov to pick the move width
 * (presumably d or q -- confirm at the use sites).
 */
1976 #define AVG_3DNOW_OP(a,b,temp, size) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1977 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1978 "pavgusb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1979 "mov" #size " " #a ", " #b " \n\t"
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
/*
 * AVG_MMX2_OP(a, b, temp, size): same three-instruction sequence as
 * AVG_3DNOW_OP but using pavgb for the byte average:
 *     mov<size> b, temp
 *     pavgb     temp, a
 *     mov<size> a, b
 * b ends up holding the packed unsigned-byte average of a and the previous
 * contents of b; temp and a are overwritten.  This file also passes this
 * macro as the averaging op for the sse2 and ssse3 variants.
 */
1980 #define AVG_MMX2_OP(a,b,temp, size) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1981 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1982 "pavgb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1983 "mov" #size " " #a ", " #b " \n\t"
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1984
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
/*
 * Instantiate the quarter-pel filter families.  QPEL_H264, the
 * QPEL_H264_*_XMM macros and PUT_OP are defined outside this excerpt.
 * PAVGB is set to pavgusb while the 3dnow put/avg variants are generated,
 * then redefined to pavgb for the mmx2, sse2 and (only when HAVE_SSSE3 is
 * defined) ssse3 variants; presumably the macro bodies splice PAVGB into
 * their asm strings -- confirm in their definitions.
 * PAVGB is undefined again at the end of the run.
 */
1985 #define PAVGB "pavgusb"
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1986 QPEL_H264(put_, PUT_OP, 3dnow)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1987 QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1988 #undef PAVGB
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
1989 #define PAVGB "pavgb"
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1990 QPEL_H264(put_, PUT_OP, mmx2)
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
1991 QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1992 QPEL_H264_V_XMM(put_, PUT_OP, sse2)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1993 QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1994 QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1995 QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
1996 #ifdef HAVE_SSSE3
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1997 QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1998 QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
1999 QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2000 QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2001 QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2002 QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
2003 #endif
3095
66987f98caae 18% faster put_h264_qpel16_mc[13]2_mmx2
lorenm
parents: 3094
diff changeset
2004 #undef PAVGB
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2005
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * Expand the H264_MC* macro families for each instruction-set suffix:
 *   - 3dnow and mmx2: put_/avg_ at sizes 4, 8 and 16 via H264_MC_4816;
 *   - sse2: H264_MC_V and H264_MC_HV at sizes 8 and 16 via H264_MC_816;
 *   - ssse3 (only when HAVE_SSSE3 is defined): H264_MC_H and H264_MC_HV at
 *     sizes 8 and 16 via H264_MC_816.
 * The H264_MC, H264_MC_V and H264_MC_H definitions lie outside this excerpt.
 */
2006 H264_MC_4816(3dnow)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2007 H264_MC_4816(mmx2)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2008 H264_MC_816(H264_MC_V, sse2)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2009 H264_MC_816(H264_MC_HV, sse2)
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
2010 #ifdef HAVE_SSSE3
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2011 H264_MC_816(H264_MC_H, ssse3)
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2012 H264_MC_816(H264_MC_HV, ssse3)
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
2013 #endif
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6330
diff changeset
2014
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2015
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
/*
 * Configure and include the chroma MC template for the plain put variant:
 * H264_CHROMA_OP and H264_CHROMA_OP4 expand to nothing, the template names
 * are set to put_h264_chroma_mc8_mmx, put_h264_chroma_mc4_mmx and
 * put_h264_chroma_mc2_mmx2, and H264_CHROMA_MC8_MV0 to put_pixels8_mmx.
 * How the template consumes these names is defined in
 * dsputil_h264_template_mmx.c, which is not part of this excerpt.
 */
2016 #define H264_CHROMA_OP(S,D)
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2017 #define H264_CHROMA_OP4(S,D,T)
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2018 #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2019 #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
3213
57d31bdbebe8 added mmx implementation of h264_chroma_mc2
lorenm
parents: 3174
diff changeset
2020 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2021 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2022 #include "dsputil_h264_template_mmx.c"
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
2023
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
/*
 * Six-argument wrapper around put_h264_chroma_mc8_mmx that forwards its
 * arguments unchanged and fixes the trailing seventh argument to 1.
 * NOTE(review): judging by the _rnd/_nornd naming, that argument is
 * presumably the rounding selector of the included template -- confirm in
 * dsputil_h264_template_mmx.c.
 */
2024 static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
2025 {
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
2026 put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 1);
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
2027 }
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
/*
 * Six-argument wrapper around put_h264_chroma_mc8_mmx that forwards its
 * arguments unchanged and fixes the trailing seventh argument to 0.
 * NOTE(review): presumably this selects the no-rounding behaviour of the
 * included template (the _nornd counterpart of the _rnd wrapper) -- confirm
 * in dsputil_h264_template_mmx.c.
 */
2028 static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
2029 {
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
2030 put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 0);
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
2031 }
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
2032
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
/*
 * Drop the six names of the put configuration, then re-include the same
 * template for the mmx2 averaging variant: H264_CHROMA_OP(S,D) now emits
 * pavgb S, D and H264_CHROMA_OP4(S,D,T) emits movd S, T followed by
 * pavgb T, D.  The template names become avg_h264_chroma_mc8_mmx2,
 * avg_h264_chroma_mc4_mmx2 and avg_h264_chroma_mc2_mmx2, and
 * H264_CHROMA_MC8_MV0 becomes avg_pixels8_mmx2.
 */
2033 #undef H264_CHROMA_OP
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2034 #undef H264_CHROMA_OP4
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2035 #undef H264_CHROMA_MC8_TMPL
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2036 #undef H264_CHROMA_MC4_TMPL
3213
57d31bdbebe8 added mmx implementation of h264_chroma_mc2
lorenm
parents: 3174
diff changeset
2037 #undef H264_CHROMA_MC2_TMPL
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2038 #undef H264_CHROMA_MC8_MV0
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2039
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2040 #define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2041 #define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2042 "pavgb " #T ", " #D " \n\t"
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2043 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2044 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
3213
57d31bdbebe8 added mmx implementation of h264_chroma_mc2
lorenm
parents: 3174
diff changeset
2045 #define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2046 #define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2047 #include "dsputil_h264_template_mmx.c"
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
/*
 * Six-argument wrapper around avg_h264_chroma_mc8_mmx2 that forwards its
 * arguments unchanged and fixes the trailing seventh argument to 1.
 * NOTE(review): presumably the rounding selector of the included template
 * -- confirm in dsputil_h264_template_mmx.c.  No _nornd counterpart is
 * defined for this variant in the visible code.
 */
2048 static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
2049 {
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
2050 avg_h264_chroma_mc8_mmx2(dst, src, stride, h, x, y, 1);
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
2051 }
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
/*
 * Drop the mmx2 averaging configuration, then re-include the template for
 * the 3dnow averaging variant: H264_CHROMA_OP(S,D) emits pavgusb S, D and
 * H264_CHROMA_OP4(S,D,T) emits movd S, T followed by pavgusb T, D.  The
 * template names become avg_h264_chroma_mc8_3dnow and
 * avg_h264_chroma_mc4_3dnow, and H264_CHROMA_MC8_MV0 becomes
 * avg_pixels8_3dnow.
 * NOTE(review): H264_CHROMA_MC2_TMPL is left undefined for this include,
 * unlike the two earlier configurations; whether the template guards its
 * mc2 code on that name cannot be seen from here -- confirm in the template.
 */
2052 #undef H264_CHROMA_OP
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2053 #undef H264_CHROMA_OP4
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2054 #undef H264_CHROMA_MC8_TMPL
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2055 #undef H264_CHROMA_MC4_TMPL
3213
57d31bdbebe8 added mmx implementation of h264_chroma_mc2
lorenm
parents: 3174
diff changeset
2056 #undef H264_CHROMA_MC2_TMPL
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2057 #undef H264_CHROMA_MC8_MV0
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2058
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2059 #define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2060 #define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2061 "pavgusb " #T ", " #D " \n\t"
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2062 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_3dnow
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2063 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_3dnow
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2064 #define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2065 #include "dsputil_h264_template_mmx.c"
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
/*
 * Six-argument wrapper around avg_h264_chroma_mc8_3dnow that forwards its
 * arguments unchanged and fixes the trailing seventh argument to 1.
 * NOTE(review): presumably the rounding selector of the included template
 * -- confirm in dsputil_h264_template_mmx.c.
 */
2066 static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
2067 {
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
2068 avg_h264_chroma_mc8_3dnow(dst, src, stride, h, x, y, 1);
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 5947
diff changeset
2069 }
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
/*
 * Clear the five names that made up the 3dnow averaging configuration
 * (no H264_CHROMA_MC2_TMPL was defined for it, so none is undefined here).
 */
2070 #undef H264_CHROMA_OP
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2071 #undef H264_CHROMA_OP4
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2072 #undef H264_CHROMA_MC8_TMPL
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2073 #undef H264_CHROMA_MC4_TMPL
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2074 #undef H264_CHROMA_MC8_MV0
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents:
diff changeset
2075
6557
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
/*
 * SSSE3 chroma MC, put variant (compiled only when HAVE_SSSE3 is defined;
 * the matching #endif follows the avg variant further down).
 * AVG_OP(X) expands to nothing here, so whatever the template wraps in
 * AVG_OP is dropped.  The two #undef lines are defensive: both names were
 * already undefined just above.  The template names become
 * put_h264_chroma_mc8_ssse3 and put_h264_chroma_mc4_ssse3, and
 * H264_CHROMA_MC8_MV0 becomes put_pixels8_mmx.  The template body lives in
 * dsputil_h264_template_ssse3.c, outside this excerpt.
 */
2076 #ifdef HAVE_SSSE3
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2077 #define AVG_OP(X)
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2078 #undef H264_CHROMA_MC8_TMPL
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2079 #undef H264_CHROMA_MC4_TMPL
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2080 #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2081 #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2082 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2083 #include "dsputil_h264_template_ssse3.c"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
/*
 * Six-argument wrapper around put_h264_chroma_mc8_ssse3 that forwards its
 * arguments unchanged and fixes the trailing seventh argument to 1.
 * NOTE(review): presumably the rounding selector of the included template
 * -- confirm in dsputil_h264_template_ssse3.c.
 */
2084 static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2085 {
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2086 put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2087 }
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
/*
 * Six-argument wrapper around put_h264_chroma_mc8_ssse3 that forwards its
 * arguments unchanged and fixes the trailing seventh argument to 0.
 * NOTE(review): presumably this selects the no-rounding behaviour of the
 * included template -- confirm in dsputil_h264_template_ssse3.c.
 */
2088 static void put_h264_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2089 {
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2090 put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0);
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2091 }
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2092
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
/*
 * SSSE3 chroma MC, avg variant: clear the put configuration and include the
 * same template again.  AVG_OP(X) now expands to X, so whatever the template
 * wraps in AVG_OP is kept.  The template names become
 * avg_h264_chroma_mc8_ssse3 and avg_h264_chroma_mc4_ssse3, and
 * H264_CHROMA_MC8_MV0 becomes avg_pixels8_mmx2.
 */
2093 #undef AVG_OP
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2094 #undef H264_CHROMA_MC8_TMPL
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2095 #undef H264_CHROMA_MC4_TMPL
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2096 #undef H264_CHROMA_MC8_MV0
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2097 #define AVG_OP(X) X
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2098 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2099 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2100 #define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2101 #include "dsputil_h264_template_ssse3.c"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
/*
 * Six-argument wrapper around avg_h264_chroma_mc8_ssse3 that forwards its
 * arguments unchanged and fixes the trailing seventh argument to 1.
 * NOTE(review): presumably the rounding selector of the included template
 * -- confirm in dsputil_h264_template_ssse3.c.  No _nornd counterpart is
 * defined for the avg variant in the visible code.
 */
2102 static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2103 {
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2104 avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2105 }
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
/*
 * Clear the SSSE3 avg configuration and close the HAVE_SSSE3 conditional
 * that was opened before the SSSE3 put configuration.
 */
2106 #undef AVG_OP
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2107 #undef H264_CHROMA_MC8_TMPL
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2108 #undef H264_CHROMA_MC4_TMPL
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2109 #undef H264_CHROMA_MC8_MV0
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2110 #endif
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6345
diff changeset
2111
2902
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2112 /***********************************/
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2113 /* weighted prediction */
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2114
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
/*
 * In-place weighted prediction for a w x h block of 8-bit samples at dst
 * (row pitch stride), implemented with MMX registers plus pshufw.
 *
 * Per sample the code computes, in 16-bit lanes,
 *     dst = sat_u8((dst * weight + off) >> log2_denom)
 * with off = (offset << log2_denom) + ((1 << log2_denom) >> 1), i.e. the
 * offset pre-scaled by the denominator plus a half-denominator rounding
 * term.  The multiply keeps the low 16 bits (pmullw), the add saturates as
 * signed 16-bit (paddsw), the shift is arithmetic (psraw) and the final pack
 * saturates to unsigned bytes (packuswb).
 *
 * Setup asm: weight goes to mm4 and off to mm5, each with its low word
 * broadcast to all four words by pshufw $0; log2_denom goes to mm6 as the
 * shift count; mm7 is zeroed and serves as the zero operand for the
 * byte-to-word unpack and for the pack.
 *
 * Loop: each inner step loads four samples from row y (dst+x) and four from
 * row y+1 (dst+x+stride), processes both and stores them back; the outer
 * loop then advances dst by two rows.
 *
 * NOTE(review): presumably callers pass w as a multiple of 4 and an even h;
 * otherwise the 32-bit accesses and the second row reach past the block --
 * confirm against the callers.
 * NOTE(review): mm4-mm7 are loaded in one asm statement and consumed in
 * later ones with no operands or clobbers tying them together, so this
 * relies on the compiler leaving the MMX registers untouched in between.
 * No emms is issued in this function.
 */
2115 static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2116 {
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2117 int x, y;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2118 offset <<= log2_denom;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2119 offset += (1 << log2_denom) >> 1;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
2120 __asm__ volatile(
2902
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2121 "movd %0, %%mm4 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2122 "movd %1, %%mm5 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2123 "movd %2, %%mm6 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2124 "pshufw $0, %%mm4, %%mm4 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2125 "pshufw $0, %%mm5, %%mm5 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2126 "pxor %%mm7, %%mm7 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2127 :: "g"(weight), "g"(offset), "g"(log2_denom)
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2128 );
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2129 for(y=0; y<h; y+=2){
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2130 for(x=0; x<w; x+=4){
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7327
diff changeset
2131 __asm__ volatile(
2902
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2132 "movd %0, %%mm0 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2133 "movd %1, %%mm1 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2134 "punpcklbw %%mm7, %%mm0 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2135 "punpcklbw %%mm7, %%mm1 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2136 "pmullw %%mm4, %%mm0 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2137 "pmullw %%mm4, %%mm1 \n\t"
3001
b52d8ee430f6 fix some potential arithmetic overflows in pred_direct_motion() and
lorenm
parents: 2979
diff changeset
2138 "paddsw %%mm5, %%mm0 \n\t"
b52d8ee430f6 fix some potential arithmetic overflows in pred_direct_motion() and
lorenm
parents: 2979
diff changeset
2139 "paddsw %%mm5, %%mm1 \n\t"
2902
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2140 "psraw %%mm6, %%mm0 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2141 "psraw %%mm6, %%mm1 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2142 "packuswb %%mm7, %%mm0 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2143 "packuswb %%mm7, %%mm1 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2144 "movd %%mm0, %0 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2145 "movd %%mm1, %1 \n\t"
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2146 : "+m"(*(uint32_t*)(dst+x)),
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2147 "+m"(*(uint32_t*)(dst+x+stride))
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2148 );
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2149 }
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2150 dst += 2*stride;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2151 }
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2152 }
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2153
3029
7a92269a3ccd tweak h264_biweight
lorenm
parents: 3001
diff changeset
/**
 * Bidirectional weighted prediction of a w x h pixel block (MMX2).
 *
 * The block at dst is blended in place with the block at src. For each
 * pixel the asm computes, in signed 16-bit lanes,
 *
 *   dst = clip_uint8((dst*weightd + src*weights + rnd) >> (log2_denom + 1))
 *
 * where rnd = ((offset + 1) | 1) * 2^log2_denom. The products keep only
 * their low 16 bits (pmullw), the two additions saturate (paddsw), the
 * shift is arithmetic (psraw) and the final pack clamps to 0..255
 * (packuswb).
 *
 * @param dst        first prediction; overwritten with the result
 * @param src        second prediction; only read
 * @param stride     byte distance between consecutive rows, applied to
 *                   both dst and src
 * @param log2_denom log2 of the weight denominator
 * @param weightd    weight applied to dst pixels
 * @param weights    weight applied to src pixels
 * @param offset     additive offset, folded into the rounding term
 * @param w          block width; each step loads and stores 4 bytes, so
 *                   the rows are covered in groups of 4 pixels
 * @param h          block height in rows
 *
 * The constants are loaded into mm3..mm7 by the first asm statement and
 * are expected to still be there in the per-pixel asm statements; no
 * clobbers are declared. The function does not execute emms.
 */
static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
{
    int x, y;
    /* offset may be negative: scale by multiplication, because
     * left-shifting a negative int is undefined behaviour. */
    offset = ((offset + 1) | 1) * (1 << log2_denom);
    /* Broadcast the weights and the rounding term to all four 16-bit
     * lanes, keep the shift count in mm6 and a zero register in mm7.
     * "rm" (not "g"): movd has no immediate form, so the operands must
     * never be emitted as constants. */
    __asm__ volatile(
        "movd    %0, %%mm3        \n\t"
        "movd    %1, %%mm4        \n\t"
        "movd    %2, %%mm5        \n\t"
        "movd    %3, %%mm6        \n\t"
        "pshufw  $0, %%mm3, %%mm3 \n\t"
        "pshufw  $0, %%mm4, %%mm4 \n\t"
        "pshufw  $0, %%mm5, %%mm5 \n\t"
        "pxor    %%mm7, %%mm7     \n\t"
        :: "rm"(weightd), "rm"(weights), "rm"(offset), "rm"(log2_denom+1)
    );
    for(y=0; y<h; y++){
        for(x=0; x<w; x+=4){
            __asm__ volatile(
                "movd      %0,    %%mm0 \n\t" /* 4 dst pixels */
                "movd      %1,    %%mm1 \n\t" /* 4 src pixels */
                "punpcklbw %%mm7, %%mm0 \n\t" /* widen bytes to words */
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw    %%mm3, %%mm0 \n\t" /* dst * weightd */
                "pmullw    %%mm4, %%mm1 \n\t" /* src * weights */
                "paddsw    %%mm1, %%mm0 \n\t"
                "paddsw    %%mm5, %%mm0 \n\t" /* + rounding/offset term */
                "psraw     %%mm6, %%mm0 \n\t" /* >> (log2_denom + 1) */
                "packuswb  %%mm0, %%mm0 \n\t" /* clamp to 0..255 */
                "movd      %%mm0, %0    \n\t"
                : "+m"(*(uint32_t*)(dst+x))
                :  "m"(*(uint32_t*)(src+x))
            );
        }
        src += stride;
        dst += stride;
    }
}
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2191
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
/*
 * H264_WEIGHT(W,H) emits the two fixed-size MMX2 entry points for one
 * block size:
 *   ff_h264_biweight_<W>x<H>_mmx2()  - forwards to ff_h264_biweight_WxH_mmx2()
 *   ff_h264_weight_<W>x<H>_mmx2()    - forwards to ff_h264_weight_WxH_mmx2()
 * Each wrapper passes its arguments through unchanged and appends W and H
 * as the width and height.
 */
#define H264_WEIGHT(W,H) \
static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, \
                                                      int stride, int log2_denom, \
                                                      int weightd, int weights, \
                                                      int offset) \
{ \
    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, \
                              weightd, weights, offset, W, H); \
} \
static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, \
                                                    int log2_denom, int weight, \
                                                    int offset) \
{ \
    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
}

/* One pair of wrappers per block size instantiated here. */
H264_WEIGHT(16,16)
H264_WEIGHT(16, 8)
H264_WEIGHT( 8,16)
H264_WEIGHT( 8, 8)
H264_WEIGHT( 8, 4)
H264_WEIGHT( 4, 8)
H264_WEIGHT( 4, 4)
H264_WEIGHT( 4, 2)
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2855
diff changeset
2208