Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 5591:642588a60570 libavcodec
update mmx code to latest snow changes
note, the code likely can overflow and thus needs some more changes
sse2 updated too but disabled as it is untested
author | michael |
---|---|
date | Sat, 25 Aug 2007 15:20:56 +0000 |
parents | 3ae03eacbe9f |
children | 384629ebcb93 |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
1739
07a484280a82
copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents:
1729
diff
changeset
|
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
0 | 5 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3932
diff
changeset
|
6 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3932
diff
changeset
|
7 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3932
diff
changeset
|
8 * FFmpeg is free software; you can redistribute it and/or |
429 | 9 * modify it under the terms of the GNU Lesser General Public |
10 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3932
diff
changeset
|
11 * version 2.1 of the License, or (at your option) any later version. |
0 | 12 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3932
diff
changeset
|
13 * FFmpeg is distributed in the hope that it will be useful, |
0 | 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 * Lesser General Public License for more details. | |
0 | 17 * |
429 | 18 * You should have received a copy of the GNU Lesser General Public |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3932
diff
changeset
|
19 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
0 | 21 * |
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
23 */ | |
24 | |
5010
d5ba514e3f4a
Add libavcodec to compiler include flags in order to simplify header
diego
parents:
5007
diff
changeset
|
25 #include "dsputil.h" |
d5ba514e3f4a
Add libavcodec to compiler include flags in order to simplify header
diego
parents:
5007
diff
changeset
|
26 #include "simple_idct.h" |
d5ba514e3f4a
Add libavcodec to compiler include flags in order to simplify header
diego
parents:
5007
diff
changeset
|
27 #include "mpegvideo.h" |
3398
e0927bc44a10
Move REG_* macros from libavcodec/i386/mmx.h to libavutil/x86_cpu.h
lucabe
parents:
3250
diff
changeset
|
28 #include "x86_cpu.h" |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
29 #include "mmx.h" |
5014
42b99a3aadde
better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents:
5010
diff
changeset
|
30 #include "vp3dsp_mmx.h" |
42b99a3aadde
better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents:
5010
diff
changeset
|
31 #include "vp3dsp_sse2.h" |
5277
7b3fcb7c61ce
Avoid linking with h263.c functions when the relevant codecs
aurel
parents:
5255
diff
changeset
|
32 #include "h263.h" |
0 | 33 |
1729 | 34 //#undef NDEBUG |
35 //#include <assert.h> | |
36 | |
2868 | 37 extern void ff_idct_xvid_mmx(short *block); |
38 extern void ff_idct_xvid_mmx2(short *block); | |
1647 | 39 |
4197 | 40 int mm_flags; /* multimedia extension flags */ |
936 | 41 |
0 | 42 /* pixel operations */ |
1845
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
43 static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL; |
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
44 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL; |
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
45 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL; |
0 | 46 |
3557
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
47 static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) = |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
48 {0x8000000080000000ULL, 0x8000000080000000ULL}; |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
49 |
1845
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
50 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL; |
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
51 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL; |
2633 | 52 static const uint64_t ff_pw_4 attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL; |
2209 | 53 static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL; |
2922
d772011258ec
faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents:
2902
diff
changeset
|
54 static const uint64_t ff_pw_8 attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL; |
1845
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
55 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL; |
2209 | 56 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL; |
2754 | 57 static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL; |
1845
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
58 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; |
954 | 59 |
3645
47821be55b6c
mmx implementation of deblocking strength decision.
lorenm
parents:
3576
diff
changeset
|
60 static const uint64_t ff_pb_1 attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL; |
47821be55b6c
mmx implementation of deblocking strength decision.
lorenm
parents:
3576
diff
changeset
|
61 static const uint64_t ff_pb_3 attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL; |
47821be55b6c
mmx implementation of deblocking strength decision.
lorenm
parents:
3576
diff
changeset
|
62 static const uint64_t ff_pb_7 attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL; |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
63 static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL; |
4127 | 64 static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL; |
65 static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL; | |
1845
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
66 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL; |
1647 | 67 |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3574
diff
changeset
|
68 #define JUMPALIGN() __asm __volatile (ASMALIGN(3)::) |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
69 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
70 |
448 | 71 #define MOVQ_WONE(regd) \ |
72 __asm __volatile ( \ | |
73 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
74 "psrlw $15, %%" #regd ::) | |
75 | |
76 #define MOVQ_BFE(regd) \ | |
77 __asm __volatile ( \ | |
78 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ | |
79 "paddb %%" #regd ", %%" #regd " \n\t" ::) | |
80 | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
81 #ifndef PIC |
448 | 82 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone)) |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
83 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo)) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
84 #else |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
85 // for shared library it's better to use this way for accessing constants |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
86 // pcmpeqd -> -1 |
448 | 87 #define MOVQ_BONE(regd) \ |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
88 __asm __volatile ( \ |
448 | 89 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
90 "psrlw $15, %%" #regd " \n\t" \ | |
91 "packuswb %%" #regd ", %%" #regd " \n\t" ::) | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
92 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
93 #define MOVQ_WTWO(regd) \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
94 __asm __volatile ( \ |
448 | 95 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
96 "psrlw $15, %%" #regd " \n\t" \ | |
97 "psllw $1, %%" #regd " \n\t"::) | |
387 | 98 |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
99 #endif |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
100 |
448 | 101 // using regr as temporary and for the output result |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
102 // first argument is unmodified and second is trashed |
471 | 103 // regfe is supposed to contain 0xfefefefefefefefe |
104 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ | |
2979 | 105 "movq " #rega ", " #regr " \n\t"\ |
106 "pand " #regb ", " #regr " \n\t"\ | |
107 "pxor " #rega ", " #regb " \n\t"\ | |
108 "pand " #regfe "," #regb " \n\t"\ | |
109 "psrlq $1, " #regb " \n\t"\ | |
110 "paddb " #regb ", " #regr " \n\t" | |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
111 |
471 | 112 #define PAVGB_MMX(rega, regb, regr, regfe) \ |
2979 | 113 "movq " #rega ", " #regr " \n\t"\ |
114 "por " #regb ", " #regr " \n\t"\ | |
115 "pxor " #rega ", " #regb " \n\t"\ | |
116 "pand " #regfe "," #regb " \n\t"\ | |
117 "psrlq $1, " #regb " \n\t"\ | |
118 "psubb " #regb ", " #regr " \n\t" | |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
119 |
471 | 120 // mm6 is supposed to contain 0xfefefefefefefefe |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
121 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ |
2979 | 122 "movq " #rega ", " #regr " \n\t"\ |
123 "movq " #regc ", " #regp " \n\t"\ | |
124 "pand " #regb ", " #regr " \n\t"\ | |
125 "pand " #regd ", " #regp " \n\t"\ | |
126 "pxor " #rega ", " #regb " \n\t"\ | |
127 "pxor " #regc ", " #regd " \n\t"\ | |
128 "pand %%mm6, " #regb " \n\t"\ | |
129 "pand %%mm6, " #regd " \n\t"\ | |
130 "psrlq $1, " #regb " \n\t"\ | |
131 "psrlq $1, " #regd " \n\t"\ | |
132 "paddb " #regb ", " #regr " \n\t"\ | |
133 "paddb " #regd ", " #regp " \n\t" | |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
134 |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
135 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ |
2979 | 136 "movq " #rega ", " #regr " \n\t"\ |
137 "movq " #regc ", " #regp " \n\t"\ | |
138 "por " #regb ", " #regr " \n\t"\ | |
139 "por " #regd ", " #regp " \n\t"\ | |
140 "pxor " #rega ", " #regb " \n\t"\ | |
141 "pxor " #regc ", " #regd " \n\t"\ | |
142 "pand %%mm6, " #regb " \n\t"\ | |
143 "pand %%mm6, " #regd " \n\t"\ | |
144 "psrlq $1, " #regd " \n\t"\ | |
145 "psrlq $1, " #regb " \n\t"\ | |
146 "psubb " #regb ", " #regr " \n\t"\ | |
147 "psubb " #regd ", " #regp " \n\t" | |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
148 |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
149 /***********************************/ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
150 /* MMX no rounding */ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
151 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx |
448 | 152 #define SET_RND MOVQ_WONE |
2979 | 153 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) |
154 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) | |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
155 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
156 #include "dsputil_mmx_rnd.h" |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
157 |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
158 #undef DEF |
448 | 159 #undef SET_RND |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
160 #undef PAVGBP |
471 | 161 #undef PAVGB |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
162 /***********************************/ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
163 /* MMX rounding */ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
164 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
165 #define DEF(x, y) x ## _ ## y ##_mmx |
448 | 166 #define SET_RND MOVQ_WTWO |
2979 | 167 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) |
168 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) | |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
169 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
170 #include "dsputil_mmx_rnd.h" |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
171 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
172 #undef DEF |
448 | 173 #undef SET_RND |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
174 #undef PAVGBP |
471 | 175 #undef PAVGB |
387 | 176 |
0 | 177 /***********************************/ |
178 /* 3Dnow specific */ | |
179 | |
180 #define DEF(x) x ## _3dnow | |
181 #define PAVGB "pavgusb" | |
182 | |
183 #include "dsputil_mmx_avg.h" | |
184 | |
185 #undef DEF | |
186 #undef PAVGB | |
187 | |
188 /***********************************/ | |
189 /* MMX2 specific */ | |
190 | |
386 | 191 #define DEF(x) x ## _mmx2 |
0 | 192 |
193 /* Introduced only in MMX2 set */ | |
194 #define PAVGB "pavgb" | |
195 | |
196 #include "dsputil_mmx_avg.h" | |
197 | |
198 #undef DEF | |
199 #undef PAVGB | |
200 | |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
201 #define SBUTTERFLY(a,b,t,n,m)\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
202 "mov" #m " " #a ", " #t " \n\t" /* abcd */\ |
3416 | 203 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ |
204 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ | |
205 | |
4939 | 206 #define TRANSPOSE4(a,b,c,d,t)\ |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
207 SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
208 SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
209 SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
210 SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ |
4939 | 211 |
0 | 212 /***********************************/ |
213 /* standard MMX */ | |
214 | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
215 #ifdef CONFIG_ENCODERS |
1064 | 216 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) |
0 | 217 { |
386 | 218 asm volatile( |
2979 | 219 "mov $-128, %%"REG_a" \n\t" |
220 "pxor %%mm7, %%mm7 \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3574
diff
changeset
|
221 ASMALIGN(4) |
2979 | 222 "1: \n\t" |
223 "movq (%0), %%mm0 \n\t" | |
224 "movq (%0, %2), %%mm2 \n\t" | |
225 "movq %%mm0, %%mm1 \n\t" | |
226 "movq %%mm2, %%mm3 \n\t" | |
227 "punpcklbw %%mm7, %%mm0 \n\t" | |
228 "punpckhbw %%mm7, %%mm1 \n\t" | |
229 "punpcklbw %%mm7, %%mm2 \n\t" | |
230 "punpckhbw %%mm7, %%mm3 \n\t" | |
231 "movq %%mm0, (%1, %%"REG_a") \n\t" | |
232 "movq %%mm1, 8(%1, %%"REG_a") \n\t" | |
233 "movq %%mm2, 16(%1, %%"REG_a") \n\t" | |
234 "movq %%mm3, 24(%1, %%"REG_a") \n\t" | |
235 "add %3, %0 \n\t" | |
236 "add $32, %%"REG_a" \n\t" | |
237 "js 1b \n\t" | |
386 | 238 : "+r" (pixels) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
239 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
240 : "%"REG_a |
386 | 241 ); |
0 | 242 } |
243 | |
1064 | 244 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) |
324 | 245 { |
246 asm volatile( | |
2979 | 247 "pxor %%mm7, %%mm7 \n\t" |
248 "mov $-128, %%"REG_a" \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3574
diff
changeset
|
249 ASMALIGN(4) |
2979 | 250 "1: \n\t" |
251 "movq (%0), %%mm0 \n\t" | |
252 "movq (%1), %%mm2 \n\t" | |
253 "movq %%mm0, %%mm1 \n\t" | |
254 "movq %%mm2, %%mm3 \n\t" | |
255 "punpcklbw %%mm7, %%mm0 \n\t" | |
256 "punpckhbw %%mm7, %%mm1 \n\t" | |
257 "punpcklbw %%mm7, %%mm2 \n\t" | |
258 "punpckhbw %%mm7, %%mm3 \n\t" | |
259 "psubw %%mm2, %%mm0 \n\t" | |
260 "psubw %%mm3, %%mm1 \n\t" | |
261 "movq %%mm0, (%2, %%"REG_a") \n\t" | |
262 "movq %%mm1, 8(%2, %%"REG_a") \n\t" | |
263 "add %3, %0 \n\t" | |
264 "add %3, %1 \n\t" | |
265 "add $16, %%"REG_a" \n\t" | |
266 "jnz 1b \n\t" | |
324 | 267 : "+r" (s1), "+r" (s2) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
268 : "r" (block+64), "r" ((long)stride) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
269 : "%"REG_a |
324 | 270 ); |
271 } | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
272 #endif //CONFIG_ENCODERS |
324 | 273 |
1064 | 274 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
0 | 275 { |
276 const DCTELEM *p; | |
1064 | 277 uint8_t *pix; |
0 | 278 |
279 /* read the pixels */ | |
280 p = block; | |
281 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
282 /* unrolled loop */ |
2979 | 283 __asm __volatile( |
284 "movq %3, %%mm0 \n\t" | |
285 "movq 8%3, %%mm1 \n\t" | |
286 "movq 16%3, %%mm2 \n\t" | |
287 "movq 24%3, %%mm3 \n\t" | |
288 "movq 32%3, %%mm4 \n\t" | |
289 "movq 40%3, %%mm5 \n\t" | |
290 "movq 48%3, %%mm6 \n\t" | |
291 "movq 56%3, %%mm7 \n\t" | |
292 "packuswb %%mm1, %%mm0 \n\t" | |
293 "packuswb %%mm3, %%mm2 \n\t" | |
294 "packuswb %%mm5, %%mm4 \n\t" | |
295 "packuswb %%mm7, %%mm6 \n\t" | |
296 "movq %%mm0, (%0) \n\t" | |
297 "movq %%mm2, (%0, %1) \n\t" | |
298 "movq %%mm4, (%0, %1, 2) \n\t" | |
299 "movq %%mm6, (%0, %2) \n\t" | |
300 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p) | |
301 :"memory"); | |
0 | 302 pix += line_size*4; |
303 p += 32; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
304 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
305 // if here would be an exact copy of the code above |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
306 // compiler would generate some very strange code |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
307 // thus using "r" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
308 __asm __volatile( |
2979 | 309 "movq (%3), %%mm0 \n\t" |
310 "movq 8(%3), %%mm1 \n\t" | |
311 "movq 16(%3), %%mm2 \n\t" | |
312 "movq 24(%3), %%mm3 \n\t" | |
313 "movq 32(%3), %%mm4 \n\t" | |
314 "movq 40(%3), %%mm5 \n\t" | |
315 "movq 48(%3), %%mm6 \n\t" | |
316 "movq 56(%3), %%mm7 \n\t" | |
317 "packuswb %%mm1, %%mm0 \n\t" | |
318 "packuswb %%mm3, %%mm2 \n\t" | |
319 "packuswb %%mm5, %%mm4 \n\t" | |
320 "packuswb %%mm7, %%mm6 \n\t" | |
321 "movq %%mm0, (%0) \n\t" | |
322 "movq %%mm2, (%0, %1) \n\t" | |
323 "movq %%mm4, (%0, %1, 2) \n\t" | |
324 "movq %%mm6, (%0, %2) \n\t" | |
325 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p) | |
326 :"memory"); | |
0 | 327 } |
328 | |
3089 | 329 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) = |
1985
b2bc62fdecc0
move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents:
1984
diff
changeset
|
330 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; |
b2bc62fdecc0
move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents:
1984
diff
changeset
|
331 |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
332 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
333 { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
334 int i; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
335 |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
336 movq_m2r(*vector128, mm1); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
337 for (i = 0; i < 8; i++) { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
338 movq_m2r(*(block), mm0); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
339 packsswb_m2r(*(block + 4), mm0); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
340 block += 8; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
341 paddb_r2r(mm1, mm0); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
342 movq_r2m(mm0, *pixels); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
343 pixels += line_size; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
344 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
345 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
346 |
1064 | 347 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
0 | 348 { |
349 const DCTELEM *p; | |
1064 | 350 uint8_t *pix; |
0 | 351 int i; |
352 | |
353 /* read the pixels */ | |
354 p = block; | |
355 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
356 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
357 i = 4; |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
358 do { |
2979 | 359 __asm __volatile( |
360 "movq (%2), %%mm0 \n\t" | |
361 "movq 8(%2), %%mm1 \n\t" | |
362 "movq 16(%2), %%mm2 \n\t" | |
363 "movq 24(%2), %%mm3 \n\t" | |
364 "movq %0, %%mm4 \n\t" | |
365 "movq %1, %%mm6 \n\t" | |
366 "movq %%mm4, %%mm5 \n\t" | |
367 "punpcklbw %%mm7, %%mm4 \n\t" | |
368 "punpckhbw %%mm7, %%mm5 \n\t" | |
369 "paddsw %%mm4, %%mm0 \n\t" | |
370 "paddsw %%mm5, %%mm1 \n\t" | |
371 "movq %%mm6, %%mm5 \n\t" | |
372 "punpcklbw %%mm7, %%mm6 \n\t" | |
373 "punpckhbw %%mm7, %%mm5 \n\t" | |
374 "paddsw %%mm6, %%mm2 \n\t" | |
375 "paddsw %%mm5, %%mm3 \n\t" | |
376 "packuswb %%mm1, %%mm0 \n\t" | |
377 "packuswb %%mm3, %%mm2 \n\t" | |
378 "movq %%mm0, %0 \n\t" | |
379 "movq %%mm2, %1 \n\t" | |
380 :"+m"(*pix), "+m"(*(pix+line_size)) | |
381 :"r"(p) | |
382 :"memory"); | |
0 | 383 pix += line_size*2; |
384 p += 16; | |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
385 } while (--i); |
0 | 386 } |
387 | |
2209 | 388 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
389 { | |
390 __asm __volatile( | |
2979 | 391 "lea (%3, %3), %%"REG_a" \n\t" |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3574
diff
changeset
|
392 ASMALIGN(3) |
2979 | 393 "1: \n\t" |
394 "movd (%1), %%mm0 \n\t" | |
395 "movd (%1, %3), %%mm1 \n\t" | |
396 "movd %%mm0, (%2) \n\t" | |
397 "movd %%mm1, (%2, %3) \n\t" | |
398 "add %%"REG_a", %1 \n\t" | |
399 "add %%"REG_a", %2 \n\t" | |
400 "movd (%1), %%mm0 \n\t" | |
401 "movd (%1, %3), %%mm1 \n\t" | |
402 "movd %%mm0, (%2) \n\t" | |
403 "movd %%mm1, (%2, %3) \n\t" | |
404 "add %%"REG_a", %1 \n\t" | |
405 "add %%"REG_a", %2 \n\t" | |
406 "subl $4, %0 \n\t" | |
407 "jnz 1b \n\t" | |
408 : "+g"(h), "+r" (pixels), "+r" (block) | |
409 : "r"((long)line_size) | |
410 : "%"REG_a, "memory" | |
411 ); | |
2209 | 412 } |
413 | |
1064 | 414 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 415 { |
471 | 416 __asm __volatile( |
2979 | 417 "lea (%3, %3), %%"REG_a" \n\t" |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3574
diff
changeset
|
418 ASMALIGN(3) |
2979 | 419 "1: \n\t" |
420 "movq (%1), %%mm0 \n\t" | |
421 "movq (%1, %3), %%mm1 \n\t" | |
422 "movq %%mm0, (%2) \n\t" | |
423 "movq %%mm1, (%2, %3) \n\t" | |
424 "add %%"REG_a", %1 \n\t" | |
425 "add %%"REG_a", %2 \n\t" | |
426 "movq (%1), %%mm0 \n\t" | |
427 "movq (%1, %3), %%mm1 \n\t" | |
428 "movq %%mm0, (%2) \n\t" | |
429 "movq %%mm1, (%2, %3) \n\t" | |
430 "add %%"REG_a", %1 \n\t" | |
431 "add %%"REG_a", %2 \n\t" | |
432 "subl $4, %0 \n\t" | |
433 "jnz 1b \n\t" | |
434 : "+g"(h), "+r" (pixels), "+r" (block) | |
435 : "r"((long)line_size) | |
436 : "%"REG_a, "memory" | |
437 ); | |
0 | 438 } |
439 | |
1064 | 440 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
651 | 441 { |
442 __asm __volatile( | |
2979 | 443 "lea (%3, %3), %%"REG_a" \n\t" |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3574
diff
changeset
|
444 ASMALIGN(3) |
2979 | 445 "1: \n\t" |
446 "movq (%1), %%mm0 \n\t" | |
447 "movq 8(%1), %%mm4 \n\t" | |
448 "movq (%1, %3), %%mm1 \n\t" | |
449 "movq 8(%1, %3), %%mm5 \n\t" | |
450 "movq %%mm0, (%2) \n\t" | |
451 "movq %%mm4, 8(%2) \n\t" | |
452 "movq %%mm1, (%2, %3) \n\t" | |
453 "movq %%mm5, 8(%2, %3) \n\t" | |
454 "add %%"REG_a", %1 \n\t" | |
455 "add %%"REG_a", %2 \n\t" | |
456 "movq (%1), %%mm0 \n\t" | |
457 "movq 8(%1), %%mm4 \n\t" | |
458 "movq (%1, %3), %%mm1 \n\t" | |
459 "movq 8(%1, %3), %%mm5 \n\t" | |
460 "movq %%mm0, (%2) \n\t" | |
461 "movq %%mm4, 8(%2) \n\t" | |
462 "movq %%mm1, (%2, %3) \n\t" | |
463 "movq %%mm5, 8(%2, %3) \n\t" | |
464 "add %%"REG_a", %1 \n\t" | |
465 "add %%"REG_a", %2 \n\t" | |
466 "subl $4, %0 \n\t" | |
467 "jnz 1b \n\t" | |
468 : "+g"(h), "+r" (pixels), "+r" (block) | |
469 : "r"((long)line_size) | |
470 : "%"REG_a, "memory" | |
471 ); | |
651 | 472 } |
473 | |
296 | 474 static void clear_blocks_mmx(DCTELEM *blocks) |
475 { | |
471 | 476 __asm __volatile( |
2979 | 477 "pxor %%mm7, %%mm7 \n\t" |
478 "mov $-128*6, %%"REG_a" \n\t" | |
479 "1: \n\t" | |
480 "movq %%mm7, (%0, %%"REG_a") \n\t" | |
481 "movq %%mm7, 8(%0, %%"REG_a") \n\t" | |
482 "movq %%mm7, 16(%0, %%"REG_a") \n\t" | |
483 "movq %%mm7, 24(%0, %%"REG_a") \n\t" | |
484 "add $32, %%"REG_a" \n\t" | |
485 " js 1b \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
486 : : "r" (((uint8_t *)blocks)+128*6) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
487 : "%"REG_a |
296 | 488 ); |
489 } | |
490 | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
491 #ifdef CONFIG_ENCODERS |
/**
 * Sum all pixel values of a 16x16 block.
 *
 * Four 16-bit lanes of mm6 accumulate the row sums; they are folded
 * horizontally after the loop and the result is masked to 16 bits
 * (the maximum, 256*255 = 65280, fits).
 *
 * @param pix       top-left pixel of the block
 * @param line_size byte distance between consecutive rows; the loop
 *                  terminates on the sign of a negative running index,
 *                  so this is expected to be positive
 * @return sum of the 256 pixel values
 */
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    /* negative offset from the end of the block, counted up to zero */
    long index= -(long)line_size*h;

    __asm __volatile(
        "pxor %%mm7, %%mm7 \n\t"        /* mm7 = 0, used for byte->word unpacking */
        "pxor %%mm6, %%mm6 \n\t"        /* mm6 = 4 x 16-bit accumulators */
        "1: \n\t"
        "movq (%2, %1), %%mm0 \n\t"
        "movq (%2, %1), %%mm1 \n\t"
        "movq 8(%2, %1), %%mm2 \n\t"
        "movq 8(%2, %1), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "paddw %%mm1, %%mm3 \n\t"
        "paddw %%mm3, %%mm6 \n\t"
        "add %3, %1 \n\t"               /* next row */
        " js 1b \n\t"                   /* until the index reaches zero */
        /* horizontal add of the four word lanes */
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        "andl $0xFFFF, %0 \n\t"         /* only the low word holds the total */
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((long)line_size)
        /* "memory": the pixels are read through a register pointer, so
           pending stores to them must be visible before the asm runs. */
        : "memory"
    );

        return sum;
}
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
529 #endif //CONFIG_ENCODERS |
688 | 530 |
/**
 * Add src to dst byte-wise with wrap-around: dst[i] += src[i] for 0 <= i < w.
 *
 * Full 16-byte chunks are handled with MMX paddb; the remaining 0..15
 * bytes are handled by the scalar tail loop.
 *
 * @param dst destination, updated in place
 * @param src bytes to add
 * @param w   number of bytes to process; values <= 0 do nothing
 */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    /* The asm loop tests its condition only AFTER processing a chunk, and
       compares unsigned (jb). Without this guard a width below 16 would
       process 16 bytes anyway, and the negative bound w-15 would then look
       like a huge unsigned value and never stop the loop. */
    if(w >= 16){
        __asm __volatile(
            "1: \n\t"
            "movq (%1, %0), %%mm0 \n\t"
            "movq (%2, %0), %%mm1 \n\t"
            "paddb %%mm0, %%mm1 \n\t"
            "movq %%mm1, (%2, %0) \n\t"
            "movq 8(%1, %0), %%mm0 \n\t"
            "movq 8(%2, %0), %%mm1 \n\t"
            "paddb %%mm0, %%mm1 \n\t"
            "movq %%mm1, 8(%2, %0) \n\t"
            "add $16, %0 \n\t"
            "cmp %3, %0 \n\t"
            " jb 1b \n\t"               /* another chunk while i < w-15 */
            : "+r" (i)
            : "r"(src), "r"(dst), "r"((long)w-15)
            /* "memory": dst is written and src is read through register
               pointers that the compiler cannot see through. */
            : "memory"
        );
    }
    /* scalar tail: the bytes that did not fill a whole 16-byte chunk */
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
552 | |
/*
 * Shared asm body of the H.263 loop filter, meant to be pasted at the start
 * of an asm statement whose operands are:
 *   %0, %1, %2, %3  four 8-byte memory rows (read with movq)
 *   %4              the value loaded with movd and replicated to all bytes
 *                   (callers in this file pass 2*strength)
 *   %5              8-byte mask applied with pand (callers pass ff_pb_FC)
 *
 * The macro only computes; it stores nothing. On exit the filtered rows are
 * left in registers: mm5 = new %0, mm3 = new %1, mm4 = new %2, mm6 = new %3.
 * All of mm0-mm7 are overwritten.
 */
#define H263_LOOP_FILTER \
        "pxor %%mm7, %%mm7 \n\t"\
        "movq %0, %%mm0 \n\t"\
        "movq %0, %%mm1 \n\t"\
        "movq %3, %%mm2 \n\t"\
        "movq %3, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "psubw %%mm2, %%mm0 \n\t"\
        "psubw %%mm3, %%mm1 \n\t"\
        "movq %1, %%mm2 \n\t"\
        "movq %1, %%mm3 \n\t"\
        "movq %2, %%mm4 \n\t"\
        "movq %2, %%mm5 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm4 \n\t"\
        "punpckhbw %%mm7, %%mm5 \n\t"\
        "psubw %%mm2, %%mm4 \n\t"\
        "psubw %%mm3, %%mm5 \n\t"\
        "psllw $2, %%mm4 \n\t"\
        "psllw $2, %%mm5 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm1, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pcmpgtw %%mm4, %%mm6 \n\t"\
        "pcmpgtw %%mm5, %%mm7 \n\t"\
        "pxor %%mm6, %%mm4 \n\t"\
        "pxor %%mm7, %%mm5 \n\t"\
        "psubw %%mm6, %%mm4 \n\t"\
        "psubw %%mm7, %%mm5 \n\t"\
        "psrlw $3, %%mm4 \n\t"\
        "psrlw $3, %%mm5 \n\t"\
        "packuswb %%mm5, %%mm4 \n\t"\
        "packsswb %%mm7, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "movd %4, %%mm2 \n\t"\
        "punpcklbw %%mm2, %%mm2 \n\t"\
        "punpcklbw %%mm2, %%mm2 \n\t"\
        "punpcklbw %%mm2, %%mm2 \n\t"\
        "psubusb %%mm4, %%mm2 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "psubusb %%mm4, %%mm3 \n\t"\
        "psubb %%mm3, %%mm2 \n\t"\
        "movq %1, %%mm3 \n\t"\
        "movq %2, %%mm4 \n\t"\
        "pxor %%mm6, %%mm3 \n\t"\
        "pxor %%mm6, %%mm4 \n\t"\
        "paddusb %%mm2, %%mm3 \n\t"\
        "psubusb %%mm2, %%mm4 \n\t"\
        "pxor %%mm6, %%mm3 \n\t"\
        "pxor %%mm6, %%mm4 \n\t"\
        "paddusb %%mm2, %%mm2 \n\t"\
        "packsswb %%mm1, %%mm0 \n\t"\
        "pcmpgtb %%mm0, %%mm7 \n\t"\
        "pxor %%mm7, %%mm0 \n\t"\
        "psubb %%mm7, %%mm0 \n\t"\
        "movq %%mm0, %%mm1 \n\t"\
        "psubusb %%mm2, %%mm0 \n\t"\
        "psubb %%mm0, %%mm1 \n\t"\
        "pand %5, %%mm1 \n\t"\
        "psrlw $2, %%mm1 \n\t"\
        "pxor %%mm7, %%mm1 \n\t"\
        "psubb %%mm7, %%mm1 \n\t"\
        "movq %0, %%mm5 \n\t"\
        "movq %3, %%mm6 \n\t"\
        "psubb %%mm1, %%mm5 \n\t"\
        "paddb %%mm1, %%mm6 \n\t"
1648 | 623 |
/**
 * Apply the H.263 loop filter to the four rows around src, 8 bytes wide.
 *
 * The rows at src-2*stride, src-stride, src and src+stride are passed to
 * H263_LOOP_FILTER as operands %0..%3 and then overwritten with the
 * filtered values it leaves in mm5, mm3, mm4 and mm6.
 * Compiles to nothing when ENABLE_ANY_H263 is 0.
 *
 * @param src    first byte of the row at offset 0 (the third of the four rows)
 * @param stride byte distance between consecutive rows
 * @param qscale index into ff_h263_loop_filter_strength; not range-checked here
 */
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        /* write the filtered rows back in place */
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %0 \n\t"
        "movq %%mm6, %3 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
    }
}
644 | |
/**
 * Transpose a 4x4 block of bytes.
 *
 * Reads 4 bytes from each of four source rows and writes 4 bytes to each of
 * four destination rows, so that destination row j holds byte j of source
 * rows 0..3 (in that order).
 *
 * @param dst        destination; row k is at dst + k*dst_stride
 * @param src        source; row k is at src + k*src_stride
 * @param dst_stride byte distance between destination rows
 * @param src_stride byte distance between source rows
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd %4, %%mm0 \n\t"               /* source rows 0..3 */
        "movd %5, %%mm1 \n\t"
        "movd %6, %%mm2 \n\t"
        "movd %7, %%mm3 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t"       /* interleave bytes of rows 0 and 1 */
        "punpcklbw %%mm3, %%mm2 \n\t"       /* interleave bytes of rows 2 and 3 */
        "movq %%mm0, %%mm1 \n\t"
        "punpcklwd %%mm2, %%mm0 \n\t"       /* mm0 = output rows 0 and 1 */
        "punpckhwd %%mm2, %%mm1 \n\t"       /* mm1 = output rows 2 and 3 */
        "movd %%mm0, %0 \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"       /* move the high dword down */
        "movd %%mm0, %1 \n\t"
        "movd %%mm1, %2 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, %3 \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        : "m" (*(uint32_t*)(src + 0*src_stride)),
          "m" (*(uint32_t*)(src + 1*src_stride)),
          "m" (*(uint32_t*)(src + 2*src_stride)),
          "m" (*(uint32_t*)(src + 3*src_stride))
    );
}
673 | |
674 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ | |
5394
e9a6215f4e3a
help some gcc version to optimize out those functions
aurel
parents:
5278
diff
changeset
|
675 if(ENABLE_ANY_H263) { |
1648 | 676 const int strength= ff_h263_loop_filter_strength[qscale]; |
677 uint64_t temp[4] __attribute__ ((aligned(8))); | |
678 uint8_t *btemp= (uint8_t*)temp; | |
2967 | 679 |
1648 | 680 src -= 2; |
681 | |
682 transpose4x4(btemp , src , 8, stride); | |
683 transpose4x4(btemp+4, src + 4*stride, 8, stride); | |
684 asm volatile( | |
685 H263_LOOP_FILTER // 5 3 4 6 | |
2967 | 686 |
1648 | 687 : "+m" (temp[0]), |
688 "+m" (temp[1]), | |
689 "+m" (temp[2]), | |
690 "+m" (temp[3]) | |
691 : "g" (2*strength), "m"(ff_pb_FC) | |
692 ); | |
693 | |
694 asm volatile( | |
2979 | 695 "movq %%mm5, %%mm1 \n\t" |
696 "movq %%mm4, %%mm0 \n\t" | |
697 "punpcklbw %%mm3, %%mm5 \n\t" | |
698 "punpcklbw %%mm6, %%mm4 \n\t" | |
699 "punpckhbw %%mm3, %%mm1 \n\t" | |
700 "punpckhbw %%mm6, %%mm0 \n\t" | |
701 "movq %%mm5, %%mm3 \n\t" | |
702 "movq %%mm1, %%mm6 \n\t" | |
703 "punpcklwd %%mm4, %%mm5 \n\t" | |
704 "punpcklwd %%mm0, %%mm1 \n\t" | |
705 "punpckhwd %%mm4, %%mm3 \n\t" | |
706 "punpckhwd %%mm0, %%mm6 \n\t" | |
707 "movd %%mm5, (%0) \n\t" | |
708 "punpckhdq %%mm5, %%mm5 \n\t" | |
709 "movd %%mm5, (%0,%2) \n\t" | |
710 "movd %%mm3, (%0,%2,2) \n\t" | |
711 "punpckhdq %%mm3, %%mm3 \n\t" | |
712 "movd %%mm3, (%0,%3) \n\t" | |
713 "movd %%mm1, (%1) \n\t" | |
714 "punpckhdq %%mm1, %%mm1 \n\t" | |
715 "movd %%mm1, (%1,%2) \n\t" | |
716 "movd %%mm6, (%1,%2,2) \n\t" | |
717 "punpckhdq %%mm6, %%mm6 \n\t" | |
718 "movd %%mm6, (%1,%3) \n\t" | |
2505
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
719 :: "r" (src), |
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
720 "r" (src + 4*stride), |
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
721 "r" ((long) stride ), |
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
722 "r" ((long)(3*stride)) |
1648 | 723 ); |
5394
e9a6215f4e3a
help some gcc version to optimize out those functions
aurel
parents:
5278
diff
changeset
|
724 } |
1648 | 725 } |
726 | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
727 #ifdef CONFIG_ENCODERS |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
984
diff
changeset
|
/**
 * Sum of squared pixel values over a 16x16 block.
 *
 * @param pix       top-left pixel of the block
 * @param line_size byte distance between consecutive rows
 * @return sum over 16 rows x 16 columns of pix[y][x]^2
 */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  __asm __volatile (
      "movl $16,%%ecx\n"                /* row counter */
      "pxor %%mm0,%%mm0\n"              /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"              /* mm7 holds the sum (2 dwords) */
      "1:\n"
      "movq (%0),%%mm2\n"               /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"              /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"              /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n"         /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n"         /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"              /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n"         /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n"         /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"           /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"           /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"           /* same for pix12-15 */
      "pmaddwd %%mm4,%%mm4\n"           /* same for pix8-11 */

      "paddd %%mm1,%%mm2\n"             /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                                  pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"                    /* next row */
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"              /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size)
      /* "memory": the pixels are read through a register pointer, so
         pending stores to them must be visible before the asm runs. */
      : "%ecx", "memory" );
    return tmp;
}
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
984
diff
changeset
|
770 |
/**
 * Sum of squared differences between two 8-pixel-wide blocks.
 *
 * Two rows are processed per iteration and the iteration count is h/2,
 * tested only after the first pass, so h is expected to be an even value
 * >= 2.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte distance between consecutive rows (both blocks)
 * @param h         number of rows
 * @return sum over h rows x 8 columns of (pix1 - pix2)^2
 */
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm __volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"                  /* two rows per iteration */
      "pxor %%mm0,%%mm0\n"              /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"              /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"               /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"               /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"            /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"            /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n"         /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n"         /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"             /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"             /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"              /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": both blocks are read through register pointers, so
         pending stores to them must be visible before the asm runs. */
      : "%ecx", "memory");
    return tmp;
}
831 | |
/**
 * Sum of squared differences between two 16-pixel-wide blocks.
 *
 * One row is processed per iteration and the row counter is tested only
 * after the first pass, so h is expected to be >= 1.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte distance between consecutive rows (both blocks)
 * @param h         number of rows
 * @return sum over h rows x 16 columns of (pix1 - pix2)^2
 */
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm __volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"              /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"              /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"               /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"               /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"              /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"              /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n"         /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n"         /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"                     /* pix1 += line_size */
      "add %3,%1\n"                     /* pix2 += line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"              /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": both blocks are read through register pointers, so
         pending stores to them must be visible before the asm runs. */
      : "%ecx", "memory");
    return tmp;
}
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
984
diff
changeset
|
891 |
2899
d3a726717baf
sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents:
2892
diff
changeset
|
/**
 * Sum of squared differences between two 16-pixel-wide blocks (SSE2).
 *
 * Two rows are handled per loop iteration (h is halved up front), so an odd
 * trailing row is ignored. The loop body runs before its exit test, so h
 * must be at least 2.
 *
 * @param v         unused
 * @param pix1      first block; read with unaligned loads
 * @param pix2      second block; read with unaligned loads
 * @param line_size byte distance between consecutive rows of both blocks
 * @param h         number of rows
 * @return sum of (pix1[y][x] - pix2[y][x])^2 over 16 columns and the rows visited
 */
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "shr $1,%2\n"                 /* row pairs to process */
      "pxor %%xmm0,%%xmm0\n"        /* xmm0 = 0 (for byte->word unpacking) */
      "pxor %%xmm7,%%xmm7\n"        /* xmm7 holds four 32-bit partial sums */
      "1:\n"
      "movdqu (%0),%%xmm1\n"        /* xmm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"        /* xmm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n"     /* xmm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n"     /* xmm4 = pix2[1][0-15] */

      /* absolute difference: subtract with unsigned saturation in both
       * directions, then OR the two results together */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"         /* xmm2 = |row0 difference| */
      "por %%xmm3,%%xmm4\n"         /* xmm4 = |row1 difference| */

      /* widen to 16-bit lanes so the values can be squared */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n"   /* row0 now spread over (xmm1,xmm2) */
      "punpcklbw %%xmm0,%%xmm3\n"   /* row1 now spread over (xmm3,xmm4) */

      /* square and add adjacent pairs into 32-bit lanes */
      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"         /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"         /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "decl %2\n"
      "jnz 1b\n"

      /* horizontal add of the four 32-bit partial sums */
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"         /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"         /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((long)line_size)
      /* the asm reads pixel data through pix1/pix2, which the operand list
       * does not describe; "memory" keeps earlier stores from being moved
       * past or dropped before this statement */
      : "memory");
    return tmp;
}
d3a726717baf
sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents:
2892
diff
changeset
|
953 |
/**
 * High-frequency "noise" measure of an 8-pixel-wide block (MMX).
 *
 * For every pair of vertically adjacent rows, the seven horizontal
 * neighbour differences pix[x] - pix[x+1] (x = 0..6) of the upper row are
 * compared with those of the lower row, and the absolute values of the
 * mismatches are summed.
 *
 * The running sum is kept in 16-bit lanes (paddw) and only widened at the
 * end, so it wraps for sufficiently large totals. The loop consumes two rows
 * per iteration and runs its body before testing the counter, so h must be
 * even and at least 4.
 *
 * @param pix1      top-left pixel of the block; exactly 8 bytes per row are read
 * @param line_size byte distance between consecutive rows
 * @param h         number of rows
 * @return accumulated absolute second-order difference
 */
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %3,%%ecx\n"             /* ecx = h-2: rows left after the first two */
      "pxor %%mm7,%%mm7\n"          /* mm7 = 0 */
      "pxor %%mm6,%%mm6\n"          /* mm6 = accumulator (4 x 16 bit) */

      /* row 0: (mm0,mm2) = pix[x] - pix[x+1]; the shift pair clears byte 7 */
      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      /* row 1: (mm4,mm5) = horizontal differences, then |row0 - row1| */
      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      /* abs(): build a sign mask, xor with it, subtract it */
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      /* even row: new differences in (mm0,mm2), compared with (mm4,mm5) */
      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      /* odd row: new differences in (mm4,mm5), compared with (mm0,mm2) */
      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      /* widen the four 16-bit lanes to 32 bit and add them up */
      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      /* "memory": pixel data is read through pix1 behind the compiler's back */
      : "%ecx", "memory");
    return tmp;
}
1078 | |
/**
 * High-frequency "noise" measure of a 16-pixel-wide block (MMX).
 *
 * The inline assembly covers the horizontal neighbour differences
 * pix[x] - pix[x+1] for x = 0..7 (it reads 9 bytes per row), comparing them
 * between vertically adjacent rows and summing the absolute mismatches.
 * The remaining columns are delegated to hf_noise8_mmx() on pix+8.
 *
 * The running sum is kept in 16-bit lanes and can wrap for large totals.
 * The loop consumes two rows per iteration and runs its body before testing
 * the counter, so h must be even and at least 4.
 *
 * @param pix1      top-left pixel of the block
 * @param line_size byte distance between consecutive rows
 * @param h         number of rows
 * @return accumulated absolute second-order difference of the whole block
 */
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;            /* pix1 is advanced by the asm below */
  __asm__ volatile (
      "movl %3,%%ecx\n"             /* ecx = h-2: rows left after the first two */
      "pxor %%mm7,%%mm7\n"          /* mm7 = 0 */
      "pxor %%mm6,%%mm6\n"          /* mm6 = accumulator (4 x 16 bit) */

      /* row 0: (mm0,mm2) = pix[x] - pix[x+1] */
      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      /* row 1: (mm4,mm5) = horizontal differences, then |row0 - row1| */
      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      /* abs(): build a sign mask, xor with it, subtract it */
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      /* even row: new differences in (mm0,mm2), compared with (mm4,mm5) */
      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      /* odd row: new differences in (mm4,mm5), compared with (mm0,mm2) */
      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      /* widen the four 16-bit lanes to 32 bit and add them up */
      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      /* "memory": pixel data is read through pix1 behind the compiler's back */
      : "%ecx", "memory");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
1192 | |
2864
95bac7109ff0
Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents:
2754
diff
changeset
|
1193 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { |
95bac7109ff0
Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents:
2754
diff
changeset
|
1194 MpegEncContext *c = p; |
2940 | 1195 int score1, score2; |
1196 | |
1197 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); | |
1198 else score1 = sse16_mmx(c, pix1, pix2, line_size, h); | |
1199 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); | |
2067 | 1200 |
4001 | 1201 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; |
1202 else return score1 + FFABS(score2)*8; | |
2067 | 1203 } |
1204 | |
2864
95bac7109ff0
Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents:
2754
diff
changeset
|
1205 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { |
95bac7109ff0
Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents:
2754
diff
changeset
|
1206 MpegEncContext *c = p; |
2067 | 1207 int score1= sse8_mmx(c, pix1, pix2, line_size, h); |
1208 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); | |
1209 | |
4001 | 1210 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; |
1211 else return score1 + FFABS(score2)*8; | |
2067 | 1212 } |
1213 | |
/**
 * Vertical SAD inside one 16-pixel-wide block (MMX): the sum of
 * |pix[y][x] - pix[y-1][x]| over x = 0..15 and y = 1..h-1.
 *
 * The sum is accumulated in 16-bit lanes and the result is masked to 16
 * bits, so the return value is the true sum modulo 65536. The loop consumes
 * two rows per iteration and runs its body before testing the counter, so h
 * must be even and at least 4.
 *
 * @param v         unused
 * @param pix       top-left pixel; asserted to be 8-byte aligned
 * @param dummy     unused
 * @param line_size byte distance between rows; asserted to be a multiple of 8
 * @param h         number of rows
 */
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    /* cast through long, not int: int would truncate a 64-bit pointer */
    assert( (((long)pix) & 7) == 0);
    assert((line_size &7) ==0);

/* Loads the next row into (out0,out1), adds |row - (in0,in1)| to mm6 and
 * advances the row pointer; (in0,in1), mm2 and mm3 are destroyed. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"          /* mm6 = accumulator (4 x 16 bit) */
      "pxor %%mm7,%%mm7\n"          /* mm7 = 0 (for byte->word unpacking) */
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      /* fold the four 16-bit lanes into the lowest one */
      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": pixel data is read through pix behind the compiler's back */
      : "%ecx", "memory");
    return tmp & 0xFFFF;
}
#undef SUM
1275 | |
/**
 * Vertical SAD inside one 16-pixel-wide block (MMX2, using psadbw): the sum
 * of |pix[y][x] - pix[y-1][x]| over x = 0..15 and y = 1..h-1.
 *
 * The sum is accumulated with 16-bit adds (paddw), so it wraps at 65536.
 * The loop consumes two rows per iteration and runs its body before testing
 * the counter, so h must be even and at least 4.
 *
 * @param v         unused
 * @param pix       top-left pixel; asserted to be 8-byte aligned
 * @param dummy     unused
 * @param line_size byte distance between rows; asserted to be a multiple of 8
 * @param h         number of rows
 */
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    /* cast through long, not int: int would truncate a 64-bit pointer */
    assert( (((long)pix) & 7) == 0);
    assert((line_size &7) ==0);

/* Loads the next row into (out0,out1), adds its SAD against (in0,in1) to
 * mm6 and advances the row pointer; (in0,in1) are destroyed. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"          /* mm6 = accumulator */
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": pixel data is read through pix behind the compiler's back */
      : "%ecx", "memory");
    return tmp;
}
#undef SUM
1316 | |
/**
 * Vertical SAD of the difference between two 16-pixel-wide blocks (MMX).
 *
 * Per pixel the byte difference pix1 - pix2 (wrapping) is formed and its top
 * bit flipped (xor 0x80); the function sums the absolute differences of
 * these values between vertically adjacent rows, for x = 0..15 and
 * y = 1..h-1.
 *
 * The sum is accumulated in 16-bit lanes and the result is masked with
 * 0x7FFF, so the return value is the true sum modulo 32768. The loop
 * consumes two rows per iteration and runs its body before testing the
 * counter, so h must be even and at least 4.
 *
 * @param v         unused
 * @param pix1      first block; asserted to be 8-byte aligned
 * @param pix2      second block; asserted to be 8-byte aligned
 * @param line_size byte distance between rows; asserted to be a multiple of 8
 * @param h         number of rows
 */
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    /* cast through long, not int: int would truncate a 64-bit pointer */
    assert( (((long)pix1) & 7) == 0);
    assert( (((long)pix2) & 7) == 0);
    assert((line_size &7) ==0);

/* Builds the next biased difference row in (out0,out1), adds its absolute
 * difference against (in0,in1) to mm6 and advances both row pointers.
 * Unpacking against mm7 (0x80 bytes) adds 0x8000 to every word; four such
 * words are summed per lane, so the bias cancels modulo 2^16. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"          /* mm6 = accumulator (4 x 16 bit) */
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"     /* mm7 = 0x80 in every byte */
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      /* fold the four 16-bit lanes into the lowest one */
      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": pixel data is read through pix1/pix2 behind the compiler's back */
      : "%ecx", "memory");
    return tmp & 0x7FFF;
}
#undef SUM
1395 | |
/**
 * Vertical SAD of the difference between two 16-pixel-wide blocks (MMX2,
 * using psadbw).
 *
 * Per pixel the byte difference pix1 - pix2 (wrapping) is formed and its top
 * bit flipped (xor 0x80); the function sums the absolute differences of
 * these values between vertically adjacent rows, for x = 0..15 and
 * y = 1..h-1.
 *
 * The sum is accumulated with 16-bit adds (paddw), so it wraps at 65536.
 * The loop consumes two rows per iteration and runs its body before testing
 * the counter, so h must be even and at least 4.
 *
 * @param v         unused
 * @param pix1      first block; asserted to be 8-byte aligned
 * @param pix2      second block; asserted to be 8-byte aligned
 * @param line_size byte distance between rows; asserted to be a multiple of 8
 * @param h         number of rows
 */
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    /* cast through long, not int: int would truncate a 64-bit pointer */
    assert( (((long)pix1) & 7) == 0);
    assert( (((long)pix2) & 7) == 0);
    assert((line_size &7) ==0);

/* Builds the next biased difference row in (out0,out1), adds its SAD against
 * (in0,in1) to mm6 and advances both row pointers; (in0,in1), mm2 and mm3
 * are destroyed. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"          /* mm6 = accumulator */
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"     /* mm7 = 0x80 in every byte */
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": pixel data is read through pix1/pix2 behind the compiler's back */
      : "%ecx", "memory");
    return tmp;
}
#undef SUM
1453 | |
/**
 * Byte-wise difference of two buffers: dst[i] = src1[i] - src2[i] (wrapping)
 * for 0 <= i < w.
 *
 * Whole 16-byte groups are handled by the MMX loop, the remaining bytes by
 * the scalar loop. No emms is issued here.
 *
 * @param dst  destination, w bytes
 * @param src1 minuend, w bytes
 * @param src2 subtrahend, w bytes
 * @param w    number of bytes; values below 16 (including <= 0) are handled
 *             entirely by the scalar loop
 */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;

    /* The asm loop executes its body before testing the bound and compares
     * unsigned (jb) against w-15: entered with w < 16 it would write 16 bytes
     * regardless and, with a negative bound, never stop. */
    if(w>=16){
        __asm__ volatile(
            "1:                             \n\t"
            "movq  (%2, %0), %%mm0          \n\t"
            "movq  (%1, %0), %%mm1          \n\t"
            "psubb %%mm0, %%mm1             \n\t"
            "movq %%mm1, (%3, %0)           \n\t"
            "movq 8(%2, %0), %%mm0          \n\t"
            "movq 8(%1, %0), %%mm1          \n\t"
            "psubb %%mm0, %%mm1             \n\t"
            "movq %%mm1, 8(%3, %0)          \n\t"
            "add $16, %0                    \n\t"
            "cmp %4, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (i)
            : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
            /* the asm stores to dst and loads from src1/src2 through plain
             * register operands; tell the compiler memory is touched */
            : "memory"
        );
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1527 | 1475 |
/**
 * HuffYUV median prediction residual (MMX2):
 * dst[i] = src2[i] - median(L, T, L + T - LT), with T = src1[i],
 * LT = src1[i-1] and L = src2[i-1], all in wrapping byte arithmetic.
 *
 * The asm loop works on 8 bytes per iteration, runs its body before testing
 * i against w, and starts at i = 0 with loads from offset -1. It therefore
 * reads src1[-1] and src2[-1] and may read and write up to the next multiple
 * of 8 at or beyond w. Element 0 is recomputed afterwards in C from the
 * caller-supplied left / left_top values.
 *
 * @param dst      residual output
 * @param src1     row above the current row
 * @param src2     current row
 * @param w        number of pixels
 * @param left     in: left neighbour of src2[0]; out: src2[w-1]
 * @param left_top in: left neighbour of src1[0]; out: src1[w-1]
 */
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t" // median(L, T, L + T - LT)
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
        /* the asm stores to dst; without "memory" the compiler may move the
         * dst[0] store below ahead of it, where the asm would overwrite it */
        : "memory"
    );

    l= *left;
    lt= *left_top;

    /* redo the first element with the real left neighbours */
    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
1511 | |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1512 #define DIFF_PIXELS_1(m,a,t,p1,p2)\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1513 "mov"#m" "#p1", "#a" \n\t"\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1514 "mov"#m" "#p2", "#t" \n\t"\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1515 "punpcklbw "#a", "#t" \n\t"\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1516 "punpcklbw "#a", "#a" \n\t"\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1517 "psubw "#t", "#a" \n\t"\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1518 |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
/*
 * DIFF_PIXELS_8(m0, m1, mm, p1, p2, stride, temp)
 *
 * Runs DIFF_PIXELS_1 on eight consecutive rows of p1/p2 and leaves the
 * results in registers <mm>0 .. <mm>7 (row n in register n).
 *
 *  m0     - forwarded as the first argument of DIFF_PIXELS_1
 *           (presumably the load-size suffix; that macro starts above)
 *  m1     - suffix appended to "mov" when spilling/reloading <mm>0
 *  mm     - register name prefix, e.g. %%mm or %%xmm
 *  p1,p2  - row pointers; copied into locals, so the caller's expressions
 *           are evaluated once and never modified
 *  stride - distance in bytes between two rows
 *  temp   - memory lvalue used as a spill slot for <mm>0
 *
 * Rows 0-6 use <mm>7 as the DIFF_PIXELS_1 scratch register.  Row 7 has
 * <mm>7 as its destination, so <mm>0 is spilled to temp, used as scratch
 * and reloaded afterwards.
 *
 * Operands: %0 = temp, %1/%2 = running p1/p2, %3 = stride, %4 = 3*stride.
 * After the two "add" instructions the pointers sit on row 3, hence
 * (%1,%4) addresses row 6 and (%1,%3,4) addresses row 7.
 *
 * All pointer/stride arguments are parenthesized so that expression
 * arguments (e.g. "a+b" as stride) keep their intended value under the
 * (long) cast and the multiplication by 3.
 */
#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=(p1), *p2b=(p2);\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1 \n\t"\
        "add %4, %2 \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0 \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0 \n\t"\
        : "=m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)(stride)), "r"((long)(stride)*3)\
    );\
}

/* MMX flavour: m0 = d, m1 = q, registers %%mm0..%%mm7. */
#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
/* SSE2 flavour: m0 = q, m1 = dqa, registers %%xmm0..%%xmm7
 * (the movdqa spill writes 16 bytes at temp). */
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1541 |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
/*
 * TRANSPOSE8(a,b,c,d,e,f,g,h,t)
 *
 * Transposes an 8x8 matrix of 16-bit words held in eight XMM registers by
 * three rounds of SBUTTERFLY interleaves (wd, then dq, then qdq).
 * The rows come out permuted: 01234567 -> 05736421.
 *
 * x86_64: %%xmm8 serves as the extra scratch register; t is unused.
 * x86_32: only eight XMM registers exist, so t must be a memory operand
 *         providing two 16-byte spill slots (t and 16+t).
 */
#ifdef ARCH_X86_64
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t) \
    /* round 1: words -> dwords */ \
    SBUTTERFLY(a,b,%%xmm8,wd,dqa) \
    SBUTTERFLY(c,d,b,wd,dqa) \
    SBUTTERFLY(e,f,d,wd,dqa) \
    SBUTTERFLY(g,h,f,wd,dqa) \
    /* round 2: dwords -> qwords */ \
    SBUTTERFLY(a,c,h,dq,dqa) \
    SBUTTERFLY(%%xmm8,b,c,dq,dqa) \
    SBUTTERFLY(e,g,b,dq,dqa) \
    SBUTTERFLY(d,f,g,dq,dqa) \
    /* round 3: qwords -> full rows */ \
    SBUTTERFLY(a,e,f,qdq,dqa) \
    SBUTTERFLY(%%xmm8,d,e,qdq,dqa) \
    SBUTTERFLY(h,b,d,qdq,dqa) \
    SBUTTERFLY(c,g,b,qdq,dqa) \
    "movdqa %%xmm8, "#g" \n\t"
#else
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t) \
    /* round 1: h is borrowed as scratch, its value parked in t */ \
    "movdqa "#h", "#t" \n\t" \
    SBUTTERFLY(a,b,h,wd,dqa) \
    "movdqa "#h", 16"#t" \n\t" \
    "movdqa "#t", "#h" \n\t" \
    SBUTTERFLY(c,d,b,wd,dqa) \
    SBUTTERFLY(e,f,d,wd,dqa) \
    SBUTTERFLY(g,h,f,wd,dqa) \
    /* round 2: swap the two spilled values through t / 16+t */ \
    SBUTTERFLY(a,c,h,dq,dqa) \
    "movdqa "#h", "#t" \n\t" \
    "movdqa 16"#t", "#h" \n\t" \
    SBUTTERFLY(h,b,c,dq,dqa) \
    SBUTTERFLY(e,g,b,dq,dqa) \
    SBUTTERFLY(d,f,g,dq,dqa) \
    /* round 3 */ \
    SBUTTERFLY(a,e,f,qdq,dqa) \
    SBUTTERFLY(h,d,e,qdq,dqa) \
    "movdqa "#h", 16"#t" \n\t" \
    "movdqa "#t", "#h" \n\t" \
    SBUTTERFLY(h,b,d,qdq,dqa) \
    SBUTTERFLY(c,g,b,qdq,dqa) \
    "movdqa 16"#t", "#g" \n\t"
#endif
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1581 |
/*
 * LBUTTERFLY2(a1,b1,a2,b2)
 *
 * Two interleaved butterflies on packed 16-bit words (AT&T operand order).
 * For each pair (a, b):  a <- a + b,  b <- b - a  (old values).
 * The difference is formed as 2*b - (a + b), so no scratch register is used.
 */
#define LBUTTERFLY2(a1,b1,a2,b2) \
    /* a += b */ \
    "paddw " #b1 ", " #a1 " \n\t" \
    "paddw " #b2 ", " #a2 " \n\t" \
    /* b *= 2 */ \
    "paddw " #b1 ", " #b1 " \n\t" \
    "paddw " #b2 ", " #b2 " \n\t" \
    /* b -= a */ \
    "psubw " #a1 ", " #b1 " \n\t" \
    "psubw " #a2 ", " #b2 " \n\t"
866 | 1589 |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
/*
 * HADAMARD8(m0..m7)
 *
 * Three rounds of LBUTTERFLY2 over eight registers: partners are at
 * distance 1 (m0/m1, m2/m3, ...), then distance 2, then distance 4.
 *
 * The definition ends on its last LBUTTERFLY2 line; it no longer carries a
 * trailing line continuation that would silently absorb whatever line follows.
 */
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m1, m2, m3)\
    LBUTTERFLY2(m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m2, m1, m3)\
    LBUTTERFLY2(m4, m6, m5, m7)\
    LBUTTERFLY2(m0, m4, m1, m5)\
    LBUTTERFLY2(m2, m6, m3, m7)

/* HADAMARD8 applied to the eight MMX registers %%mm0..%%mm7. */
#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
936 | 1599 |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
/*
 * MMABS_MMX(a,z): per-word absolute value of a using plain MMX.
 * z is clobbered: it becomes the sign mask (all ones where a < 0),
 * then a = (a ^ z) - z.
 */
#define MMABS_MMX(a,z) \
    "pxor " #z ", " #z " \n\t"    /* z = 0            */ \
    "pcmpgtw " #a ", " #z " \n\t" /* z = (0 > a)      */ \
    "pxor " #z ", " #a " \n\t"    /* a ^= z           */ \
    "psubw " #z ", " #a " \n\t"   /* a -= z           */
936 | 1605 |
/*
 * MMABS_MMX2(a,z): per-word absolute value of a using pmaxsw.
 * z is clobbered: z = 0 - a, then a = max(a, z) (signed).
 */
#define MMABS_MMX2(a,z) \
    "pxor " #z ", " #z " \n\t"   /* z = 0         */ \
    "psubw " #a ", " #z " \n\t"  /* z = -a        */ \
    "pmaxsw " #z ", " #a " \n\t" /* a = max(a,-a) */
1153 | 1610 |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
/*
 * MMABS_SSSE3(a,z): per-word absolute value of a with a single pabsw.
 * z is accepted only to keep the same argument list as the other MMABS_*
 * variants; it is neither read nor written.
 */
#define MMABS_SSSE3(a,z) \
    "pabsw " #a ", " #a " \n\t"
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1613 |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
/*
 * MMABS_SUM(a,z,sum): a = |a| (via whichever MMABS is currently defined,
 * z being its scratch register), then sum += a with unsigned saturation.
 */
#define MMABS_SUM(a,z, sum) \
    MMABS(a,z) \
    "paddusw " #a ", " #sum " \n\t"
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1617 |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
/*
 * MMABS_SUM_8x8_NOSPILL
 *
 * Sums the absolute values of %%xmm0..%%xmm7 into %%xmm0 (saturating adds).
 * Even registers accumulate into %%xmm0, odd ones into %%xmm1, and the two
 * partial sums are merged at the end.  %%xmm8 / %%xmm9 are the MMABS scratch
 * registers, so nothing has to be spilled to memory.
 */
#define MMABS_SUM_8x8_NOSPILL \
    MMABS(%%xmm0, %%xmm8) \
    MMABS(%%xmm1, %%xmm9) \
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0) \
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1) \
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0) \
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1) \
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0) \
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1) \
    "paddusw %%xmm1, %%xmm0 \n\t"
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1628 |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
/*
 * MMABS_SUM_8x8_SSE2
 *
 * Sums the absolute values of %%xmm0..%%xmm7 into %%xmm0 (saturating adds).
 *
 * x86_64: identical to MMABS_SUM_8x8_NOSPILL.
 * x86_32: %%xmm7 doubles as the MMABS scratch register, so its value is
 *         first parked at (%1) and folded in last through %%xmm2.
 *         Operand %1 must therefore point to 16 bytes of movdqa-compatible
 *         scratch memory.
 */
#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2 \
    "movdqa %%xmm7, (%1) \n\t" \
    MMABS(%%xmm0, %%xmm7) \
    MMABS(%%xmm1, %%xmm7) \
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0) \
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1) \
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0) \
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1) \
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0) \
    "movdqa (%1), %%xmm2 \n\t" \
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1) \
    "paddusw %%xmm1, %%xmm0 \n\t"
#endif
2967 | 1645 |
/*
 * LOAD4(o, a, b, c, d)
 *
 * Loads four consecutive quadwords from asm operand %1 into a, b, c, d:
 *   a <- o(%1), b <- o+8(%1), c <- o+16(%1), d <- o+24(%1)
 * o is a byte offset spelled as an assembler expression.
 *
 * The definition ends on the last movq; it carries no trailing line
 * continuation, so it does not depend on the following line being blank.
 */
#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1), "#a" \n\t"\
    "movq "#o"+8(%1), "#b" \n\t"\
    "movq "#o"+16(%1), "#c" \n\t"\
    "movq "#o"+24(%1), "#d" \n\t"
936 | 1651 |
/*
 * STORE4(o, a, b, c, d)
 *
 * Stores a, b, c, d as four consecutive quadwords relative to asm operand %1:
 *   o(%1) <- a, o+8(%1) <- b, o+16(%1) <- c, o+24(%1) <- d
 * o is a byte offset spelled as an assembler expression.
 *
 * The definition ends on the last movq; it carries no trailing line
 * continuation, so it does not depend on the following line being blank.
 */
#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1) \n\t"\
    "movq "#b", "#o"+8(%1) \n\t"\
    "movq "#c", "#o"+16(%1) \n\t"\
    "movq "#d", "#o"+24(%1) \n\t"
936 | 1657 |
4988
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
/* FIXME: HSUM_* saturates at 64k (the sums are unsigned 16-bit words), while an
 * 8x8 hadamard or dct block can get up to about 100k on extreme inputs. But that
 * is very unlikely to occur in natural video, and it is even more unlikely that
 * there is no alternative mv/mode with a lower cost. */
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
/*
 * HSUM_MMX(a, t, dst)
 *
 * Horizontal saturating sum of the four 16-bit words of MMX register a,
 * using only baseline MMX shifts.  t is a scratch MMX register; both a and t
 * are clobbered.  The low 32 bits of a are finally moved to dst.
 *
 * Only the low 16 bits of dst hold the total (the next word still contains a
 * partial sum), so the caller has to mask the result.
 *
 * The definition ends on the movd line; it carries no trailing line
 * continuation, so it does not depend on the following line being blank.
 */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movq "#a", "#t" \n\t"\
    "psrlq $16, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1669 |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
/*
 * HSUM_MMX2(a, t, dst)
 *
 * Horizontal saturating sum of the four 16-bit words of MMX register a,
 * using pshufw to bring the partner words down.  t is a scratch MMX
 * register; both a and t are clobbered.  The low 32 bits of a are finally
 * moved to dst; the total is in the low 16 bits and callers should not rely
 * on the upper bits.
 *
 * The definition ends on the movd line; it carries no trailing line
 * continuation, so it does not depend on the following line being blank.
 */
#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshufw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1676 |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
/*
 * HSUM_SSE2(a, t, dst)
 *
 * Horizontal saturating sum of the eight 16-bit words of XMM register a.
 * The upper half is folded onto the lower half with movhlps, then the low
 * four words are reduced with two pshuflw/paddusw steps.  t is a scratch XMM
 * register; both a and t are clobbered.  The low 32 bits of a are finally
 * moved to dst; the total is in the low 16 bits and callers should not rely
 * on the upper bits.
 *
 * The definition ends on the movd line; it carries no trailing line
 * continuation, so it does not depend on the following line being blank.
 */
#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1685 |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
/*
 * HADAMARD8_DIFF_MMX(cpu)
 *
 * Instantiates
 *     static int hadamard8_diff_<cpu>(void *s, uint8_t *src1, uint8_t *src2,
 *                                     int stride, int h)
 * and passes it, together with the name hadamard8_diff16_<cpu>, to
 * WARPER8_16_SQ.
 *
 * MMABS and HSUM are not defined here; presumably the includer #defines them
 * to the variant matching <cpu> before expanding this macro.
 *
 * Function contract as far as the body shows:
 *   s      - not referenced
 *   src1/2 - top-left corners of the two 8x8 pixel blocks
 *   stride - distance in bytes between rows
 *   h      - must be 8 (asserted)
 *   return - low 16 bits of the saturating sum of absolute transformed
 *            differences (see the FIXME on HSUM_* about saturation)
 *
 * Layout of temp (13 quadwords, addressed by byte offset from %1):
 *     0.. 31  transposed rows 0-3 of the left 4 columns
 *    32.. 63  transposed rows 0-3 of the right 4 columns
 *    64.. 95  transposed rows 4-7 of the left 4 columns, later reused as
 *             spill slot / partial sum
 *    96..103  spill slot for %%mm7 around TRANSPOSE4
 *
 * Both asm statements read and write temp through the register operand %1,
 * which the compiler cannot see; the "memory" clobber tells it so.
 * In the first statement "sum" is listed as an output only to keep temp at
 * operand index 1, which STORE4/LOAD4 hard-code; none of the instructions
 * visible here write %0 there.
 */
#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    /* left 4 columns: differences of 8 rows into mm0..mm7 */\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1) \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7 \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
        : "memory"\
    );\
\
    /* right 4 columns; temp[4] (byte offset 32) is only a spill slot here */\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1) \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7 \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7 \n\t"\
        "movq %%mm0, %%mm6 \n\t"\
\
        /* second pass on transposed rows 4-7: left half from temp, right half in mm4..mm7 */\
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1) \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2 \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0 \n\t"\
        "movq %%mm0, 64(%1) \n\t"\
\
        /* second pass on transposed rows 0-3 */\
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1) \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2 \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        /* add the partial sum saved at 64(%1) by the previous pass */\
        "paddusw 64(%1), %%mm0 \n\t"\
        "paddusw %%mm1, %%mm0 \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
        : "memory"\
    );\
    /* only the low word written by HSUM is the sum */\
    return sum&0xFFFF;\
}\
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1768 |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
/**
 * Generate hadamard8_diff_<cpu>() for a CPU with XMM registers, followed by
 * the WARPER8_16_SQ() expansion that pairs it with hadamard8_diff16_<cpu>.
 *
 * The generated function takes the usual compare-function arguments
 * (context pointer s, two pixel pointers, a stride and a height that must
 * be 8) and returns the low 16 bits of the value left in sum by HSUM_SSE2.
 *
 * Steps, as written below:
 *  - DIFF_PIXELS_8x8() is run on src1/src2 with temp[0] as its last argument;
 *  - the asm statement applies HADAMARD8, TRANSPOSE8 (given the memory
 *    operand (%1), i.e. temp) and HADAMARD8 again, then MMABS_SUM_8x8 and
 *    HSUM_SSE2.
 *
 * MMABS_SUM_8x8 (and whatever it uses, e.g. MMABS) has to be defined at the
 * point where this macro is expanded.
 *
 * temp reaches the asm only as an address in a register, yet the asm
 * dereferences it, so a memory clobber is declared to tell the compiler
 * that memory outside the listed operands is accessed.
 */
#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
        : "memory"\
    );\
    /* only the low 16 bits of the horizontal sum are meaningful */\
    return sum&0xFFFF;\
}\
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
936 | 1790 |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
/*
 * Instantiate the 8x8 Hadamard difference functions per instruction set.
 * Each section binds the helper names used by the generator macros, expands
 * the generator, and removes the bindings again so the next section can
 * rebind them.
 *
 * Plain MMX: MMABS -> MMABS_MMX, HSUM -> HSUM_MMX.
 */
1791 #define MMABS(a,z) MMABS_MMX(a,z) |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1792 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst) |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1793 HADAMARD8_DIFF_MMX(mmx) |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1794 #undef MMABS |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1795 #undef HSUM |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1796 |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
/*
 * MMX2 and SSE2 share one set of bindings. HADAMARD8_DIFF_SSE2(sse2) is
 * expanded while MMABS is still MMABS_MMX2; it picks up MMABS_SUM_8x8
 * (bound to MMABS_SUM_8x8_SSE2 here) and calls HSUM_SSE2 directly instead
 * of going through HSUM.
 */
1797 #define MMABS(a,z) MMABS_MMX2(a,z) |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1798 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2 |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1799 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1800 HADAMARD8_DIFF_MMX(mmx2) |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1801 HADAMARD8_DIFF_SSE2(sse2) |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1802 #undef MMABS |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1803 #undef MMABS_SUM_8x8 |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1804 #undef HSUM |
1153 | 1805 |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
/*
 * SSSE3 variant of the Hadamard difference, built only when HAVE_SSSE3 is
 * defined. It reuses the SSE2 generator with MMABS bound to MMABS_SSSE3 and
 * MMABS_SUM_8x8 bound to MMABS_SUM_8x8_NOSPILL; both bindings are dropped
 * again right after the expansion.
 */
1806 #ifdef HAVE_SSSE3 |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1807 #define MMABS(a,z) MMABS_SSSE3(a,z) |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1808 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1809 HADAMARD8_DIFF_SSE2(ssse3) |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1810 #undef MMABS |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1811 #undef MMABS_SUM_8x8 |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1812 #endif |
4749 | 1813 |
4988
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
/*
 * DCT_SAD4(m, mm, o): asm fragment handling four vector loads.
 *
 *   m   suffix appended to mov (q or dqa in the users below)
 *   mm  register name prefix (%%mm or %%xmm in the users below)
 *   o   byte offset added to the displacements 0, 16, 32 and 48 off %1
 *
 * Registers <mm>2..<mm>5 receive the loaded data; each is then passed to
 * MMABS_SUM together with <mm>6 or <mm>7 as the second argument and <mm>0
 * or <mm>1 (alternating) as the third. MMABS_SUM is defined elsewhere;
 * presumably the third argument is the running sum and the second a
 * scratch register - TODO confirm against its definition.
 */
1814 #define DCT_SAD4(m,mm,o)\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1815 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1816 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1817 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1818 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1819 MMABS_SUM(mm##2, mm##6, mm##0)\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1820 MMABS_SUM(mm##3, mm##7, mm##1)\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1821 MMABS_SUM(mm##4, mm##6, mm##0)\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1822 MMABS_SUM(mm##5, mm##7, mm##1)\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1823 |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
/*
 * DCT_SAD_MMX: asm body for sum_abs_dctelem using 64-bit MMX registers.
 *
 * mm0 and mm1 are cleared and used as the two accumulators handed to
 * DCT_SAD4. The four DCT_SAD4 expansions use movq (8 bytes per load) at
 * base offsets 0, 8, 64 and 72; with the 0/16/32/48 displacements inside
 * DCT_SAD4 this reads bytes 0..127 of the buffer addressed by %1.
 * The accumulators are merged with a saturating add and HSUM (bound by the
 * instantiating code) delivers the result into operand %0.
 */
1824 #define DCT_SAD_MMX\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1825 "pxor %%mm0, %%mm0 \n\t"\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1826 "pxor %%mm1, %%mm1 \n\t"\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1827 DCT_SAD4(q, %%mm, 0)\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1828 DCT_SAD4(q, %%mm, 8)\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1829 DCT_SAD4(q, %%mm, 64)\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1830 DCT_SAD4(q, %%mm, 72)\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1831 "paddusw %%mm1, %%mm0 \n\t"\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1832 HSUM(%%mm0, %%mm1, %0) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1833 |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
/*
 * DCT_SAD_SSE2: asm body for sum_abs_dctelem using 128-bit XMM registers.
 *
 * xmm0 and xmm1 are cleared and used as the two accumulators handed to
 * DCT_SAD4. The two DCT_SAD4 expansions use movdqa (16 bytes per load) at
 * base offsets 0 and 64, which together read bytes 0..127 of the buffer
 * addressed by %1. movdqa is the aligned load, so that buffer has to be
 * 16-byte aligned.
 * The accumulators are merged with a saturating add and HSUM (bound by the
 * instantiating code) delivers the result into operand %0.
 */
1834 #define DCT_SAD_SSE2\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1835 "pxor %%xmm0, %%xmm0 \n\t"\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1836 "pxor %%xmm1, %%xmm1 \n\t"\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1837 DCT_SAD4(dqa, %%xmm, 0)\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1838 DCT_SAD4(dqa, %%xmm, 64)\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1839 "paddusw %%xmm1, %%xmm0 \n\t"\ |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1840 HSUM(%%xmm0, %%xmm1, %0) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1841 |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
/**
 * Generate sum_abs_dctelem_<cpu>(DCTELEM *block).
 *
 * The function body is the asm fragment currently bound to DCT_SAD
 * (DCT_SAD_MMX or DCT_SAD_SSE2), which in turn relies on the MMABS and HSUM
 * bindings active at the point of expansion. The fragment reads 128 bytes
 * starting at block and leaves its result in sum; the low 16 bits of sum
 * are returned.
 *
 * block reaches the asm only as an address in a register and is read
 * through that address, so a memory clobber is declared: without it the
 * compiler is free to assume the asm does not look at the block contents.
 */
#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    asm volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
        :"memory"\
    );\
    /* only the low 16 bits of the horizontal sum are meaningful */\
    return sum&0xFFFF;\
}
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1852 |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
/*
 * Instantiate sum_abs_dctelem_<cpu>() for mmx, mmx2, sse2 and (optionally)
 * ssse3. The order of the define/undef lines matters:
 *  - DCT_SAD stays bound to DCT_SAD_MMX for both mmx and mmx2 and is only
 *    rebound to DCT_SAD_SSE2 for sse2/ssse3;
 *  - MMABS is not undefined after the mmx2 expansion, so the sse2 function
 *    is generated with MMABS still bound to MMABS_MMX2;
 *  - the ssse3 function (only with HAVE_SSSE3) reuses the DCT_SAD_SSE2 and
 *    HSUM_SSE2 bindings and only swaps MMABS for MMABS_SSSE3.
 */
1853 #define DCT_SAD DCT_SAD_MMX |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1854 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1855 #define MMABS(a,z) MMABS_MMX(a,z) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1856 DCT_SAD_FUNC(mmx) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1857 #undef MMABS |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1858 #undef HSUM |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1859 |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1860 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1861 #define MMABS(a,z) MMABS_MMX2(a,z) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1862 DCT_SAD_FUNC(mmx2) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1863 #undef HSUM |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1864 #undef DCT_SAD |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1865 |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1866 #define DCT_SAD DCT_SAD_SSE2 |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1867 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1868 DCT_SAD_FUNC(sse2) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1869 #undef MMABS |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1870 |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1871 #ifdef HAVE_SSSE3 |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1872 #define MMABS(a,z) MMABS_SSSE3(a,z) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1873 DCT_SAD_FUNC(ssse3) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1874 #undef MMABS |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1875 #endif |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1876 #undef HSUM |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1877 #undef DCT_SAD |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
1878 |
/**
 * Sum of squared differences between a vector of int8_t and a vector of
 * int16_t: returns the sum over k of (pix2[k] - pix1[k])^2.
 *
 * @param pix1 8-bit signed samples
 * @param pix2 16-bit signed samples
 * @param size number of samples. The loop consumes 8 samples per pass,
 *             walks from the end of the arrays towards index 0 and always
 *             runs at least once, so size has to be a positive multiple
 *             of 8.
 * @return the accumulated sum; differences are formed in 16-bit lanes and
 *         the squares are accumulated in 32-bit lanes without saturation.
 *
 * MMX registers mm0-mm4 are used and no emms is issued here.
 *
 * The arrays are reached only through addresses held in registers, so the
 * asm declares a memory clobber; otherwise the compiler may assume the
 * array contents are not read. __asm__ is used instead of asm so the
 * function also builds in strict ISO C modes where asm is not a keyword.
 */
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    long i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"          /* mm4: two 32-bit accumulators */
        "1: \n"
        "sub $8, %0 \n"                 /* i -= 8; the flags feed jg below */
        "movq (%2,%0), %%mm2 \n"        /* pix1[i .. i+7] */
        "movq (%3,%0,2), %%mm0 \n"      /* pix2[i .. i+3] */
        "movq 8(%3,%0,2), %%mm1 \n"     /* pix2[i+4 .. i+7] */
        "punpckhbw %%mm2, %%mm3 \n"     /* pix1 bytes 4-7 into the high byte of each word */
        "punpcklbw %%mm2, %%mm2 \n"     /* pix1 bytes 0-3 into the high byte of each word */
        "psraw $8, %%mm3 \n"            /* arithmetic shift = sign extension to 16 bit */
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"         /* pix2 - pix1, upper four */
        "psubw %%mm2, %%mm0 \n"         /* pix2 - pix1, lower four */
        "pmaddwd %%mm1, %%mm1 \n"       /* square and add adjacent pairs */
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"                      /* MMX ops leave the flags of sub intact */
        "movq %%mm4, %%mm3 \n"          /* fold the two 32-bit lanes together */
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
        :"memory"
    );
    return sum;
}
1909 | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
1910 #endif //CONFIG_ENCODERS |
866 | 1911 |
/* The no_rnd put functions for 8- and 16-wide blocks are plain aliases of
 * put_pixels8_mmx / put_pixels16_mmx; all four arguments are forwarded
 * unchanged. */
959 | 1912 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d) |
1913 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d) | |
1914 | |
/*
 * QPEL_V_LOW: asm fragment producing one output row of the lowpass filter
 * on four 16-bit lanes:
 *
 *     out = OP( pack( (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5 ) )
 *
 * with  x1 = m3 + m4,  x2 = m5 + in2,  x3 = m6 + in1,  x4 = in0 + in7.
 *
 *   m3..m6        registers holding four consecutive input rows; m3 is
 *                 overwritten and holds in7 when the fragment ends
 *   pw_20, pw_3   not referenced in the body; the constants are fetched
 *                 through MANGLE(ff_pw_20) and MANGLE(ff_pw_3) instead
 *   rnd           operand added as the rounding term before the shift
 *   in0,in1,in2,in7  memory operands for the remaining taps
 *   out, OP       destination operand and store macro; OP is invoked as
 *                 OP(%%mm5, out, %%mm7, d)
 *
 * mm4, mm5 and mm6 are hard-coded scratch registers, so m3..m6 must not
 * name any of them. packuswb saturates the result to unsigned bytes.
 */
954 | 1915 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ |
2979 | 1916 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ |
1917 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ | |
1918 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ | |
1919 "movq "#in7", " #m3 " \n\t" /* d */\ | |
1920 "movq "#in0", %%mm5 \n\t" /* D */\ | |
1921 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\ | |
1922 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\ | |
1923 "movq "#in1", %%mm5 \n\t" /* C */\ | |
1924 "movq "#in2", %%mm6 \n\t" /* B */\ | |
1925 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\ | |
1926 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ | |
1927 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ | |
1928 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ | |
1929 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ | |
1930 "paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rounder */\ | |
1931 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ | |
1932 "psraw $5, %%mm5 \n\t"\ | |
1933 "packuswb %%mm5, %%mm5 \n\t"\ | |
954 | 1934 OP(%%mm5, out, %%mm7, d) |
1935 | |
/*
 * QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)
 *
 * Expands to four static functions, each filtering h rows horizontally:
 *
 *   OPNAME mpeg4_qpel16_h_lowpass_mmx2    16 output pixels per row, asm
 *   OPNAME mpeg4_qpel16_h_lowpass_3dnow   16 output pixels per row, C + asm
 *   OPNAME mpeg4_qpel8_h_lowpass_mmx2      8 output pixels per row, asm
 *   OPNAME mpeg4_qpel8_h_lowpass_3dnow     8 output pixels per row, C + asm
 *
 * Every output sample is (20*a - 6*b + 3*c - d + ROUNDER) >> 5, packed with
 * unsigned saturation and written through OP_MMX2 / OP_3DNOW. a, b, c, d
 * are sums of two source pixels at growing distance from the sample; near
 * the row ends the indices are folded back into the row (see the explicit
 * index lists in the 3dnow variants). The 16-wide functions read src[0..16]
 * and the 8-wide ones src[0..8] of every row.
 *
 * ROUNDER is used as a memory operand holding the rounding term. RND is
 * not referenced anywhere in this macro body.
 */
959 | 1936 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\ |
/* 16-wide MMX2 version. Operands: %0 src, %1 dst, %2 h (kept in memory and */\
/* decremented once per row), %3 srcStride, %4 dstStride, %5 temp,          */\
/* %6 ROUNDER. The first four results are parked in temp until the next     */\
/* four are ready to be packed with them.                                   */\
/* NOTE(review): temp (%5) is stored to by the asm but is listed among the  */\
/* input operands; only the memory clobber covers that store.               */\
1057 | 1937 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
954 | 1938 uint64_t temp;\ |
1939 \ | |
1940 asm volatile(\ | |
2979 | 1941 "pxor %%mm7, %%mm7 \n\t"\ |
1942 "1: \n\t"\ | |
1943 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ | |
1944 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ | |
1945 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ | |
1946 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ | |
1947 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ | |
1948 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ | |
1949 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ | |
1950 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ | |
1951 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ | |
1952 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ | |
1953 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ | |
1954 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ | |
1955 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ | |
1956 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ | |
1957 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ | |
1958 "paddw %%mm3, %%mm5 \n\t" /* b */\ | |
1959 "paddw %%mm2, %%mm6 \n\t" /* c */\ | |
1960 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
1961 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ | |
1962 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ | |
1963 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ | |
1964 "paddw %%mm4, %%mm0 \n\t" /* a */\ | |
1965 "paddw %%mm1, %%mm5 \n\t" /* d */\ | |
1966 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ | |
1967 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ | |
1968 "paddw %6, %%mm6 \n\t"\ | |
1969 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ | |
1970 "psraw $5, %%mm0 \n\t"\ | |
1971 "movq %%mm0, %5 \n\t"\ | |
954 | 1972 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ |
1973 \ | |
2979 | 1974 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ |
1975 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\ | |
1976 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\ | |
1977 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\ | |
1978 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\ | |
1979 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\ | |
1980 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\ | |
1981 "paddw %%mm0, %%mm2 \n\t" /* b */\ | |
1982 "paddw %%mm5, %%mm3 \n\t" /* c */\ | |
1983 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ | |
1984 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ | |
1985 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\ | |
1986 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ | |
1987 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ | |
1988 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ | |
1989 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ | |
1990 "paddw %%mm2, %%mm1 \n\t" /* a */\ | |
1991 "paddw %%mm6, %%mm4 \n\t" /* d */\ | |
1992 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ | |
1993 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ | |
1994 "paddw %6, %%mm1 \n\t"\ | |
1995 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ | |
1996 "psraw $5, %%mm3 \n\t"\ | |
1997 "movq %5, %%mm1 \n\t"\ | |
1998 "packuswb %%mm3, %%mm1 \n\t"\ | |
959 | 1999 OP_MMX2(%%mm1, (%1),%%mm4, q)\ |
954 | 2000 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ |
2001 \ | |
2979 | 2002 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ |
2003 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ | |
2004 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ | |
2005 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\ | |
2006 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\ | |
2007 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\ | |
2008 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\ | |
2009 "paddw %%mm1, %%mm5 \n\t" /* b */\ | |
2010 "paddw %%mm4, %%mm0 \n\t" /* c */\ | |
2011 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
2012 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ | |
2013 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ | |
2014 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ | |
2015 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ | |
2016 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ | |
2017 "paddw %%mm3, %%mm2 \n\t" /* d */\ | |
2018 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ | |
2019 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\ | |
2020 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ | |
2021 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ | |
2022 "paddw %%mm2, %%mm6 \n\t" /* a */\ | |
2023 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ | |
2024 "paddw %6, %%mm0 \n\t"\ | |
2025 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ | |
2026 "psraw $5, %%mm0 \n\t"\ | |
954 | 2027 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ |
2028 \ | |
2979 | 2029 "paddw %%mm5, %%mm3 \n\t" /* a */\ |
2030 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\ | |
2031 "paddw %%mm4, %%mm6 \n\t" /* b */\ | |
2032 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\ | |
2033 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\ | |
2034 "paddw %%mm1, %%mm4 \n\t" /* c */\ | |
2035 "paddw %%mm2, %%mm5 \n\t" /* d */\ | |
2036 "paddw %%mm6, %%mm6 \n\t" /* 2b */\ | |
2037 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ | |
2038 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ | |
2039 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ | |
2040 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ | |
2041 "paddw %6, %%mm4 \n\t"\ | |
2042 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ | |
2043 "psraw $5, %%mm4 \n\t"\ | |
2044 "packuswb %%mm4, %%mm0 \n\t"\ | |
959 | 2045 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ |
954 | 2046 \ |
2979 | 2047 "add %3, %0 \n\t"\ |
2048 "add %4, %1 \n\t"\ | |
2049 "decl %2 \n\t"\ | |
2050 " jnz 1b \n\t"\ | |
967 | 2051 : "+a"(src), "+c"(dst), "+m"(h)\ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2052 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ |
966 | 2053 : "memory"\ |
954 | 2054 );\ |
2055 }\ | |
2056 \ | |
/* 16-wide 3DNow! version: the filter sums are computed in C into temp[],  */\
/* the asm only adds ROUNDER, shifts right by 5, packs and stores through  */\
/* OP_3DNOW, eight pixels at a time.                                       */\
2057 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
2058 int i;\ | |
2059 int16_t temp[16];\ | |
2060 /* quick HACK, XXX FIXME MUST be optimized */\ | |
2061 for(i=0; i<h; i++)\ | |
2062 {\ | |
2063 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ | |
2064 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ | |
2065 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ | |
2066 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ | |
2067 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ | |
2068 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\ | |
2069 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\ | |
2070 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\ | |
2071 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\ | |
2072 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\ | |
2073 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\ | |
2074 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\ | |
2075 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\ | |
2076 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\ | |
2077 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\ | |
2078 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\ | |
2079 asm volatile(\ | |
2979 | 2080 "movq (%0), %%mm0 \n\t"\ |
2081 "movq 8(%0), %%mm1 \n\t"\ | |
2082 "paddw %2, %%mm0 \n\t"\ | |
2083 "paddw %2, %%mm1 \n\t"\ | |
2084 "psraw $5, %%mm0 \n\t"\ | |
2085 "psraw $5, %%mm1 \n\t"\ | |
2086 "packuswb %%mm1, %%mm0 \n\t"\ | |
959 | 2087 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ |
2979 | 2088 "movq 16(%0), %%mm0 \n\t"\ |
2089 "movq 24(%0), %%mm1 \n\t"\ | |
2090 "paddw %2, %%mm0 \n\t"\ | |
2091 "paddw %2, %%mm1 \n\t"\ | |
2092 "psraw $5, %%mm0 \n\t"\ | |
2093 "psraw $5, %%mm1 \n\t"\ | |
2094 "packuswb %%mm1, %%mm0 \n\t"\ | |
959 | 2095 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\ |
954 | 2096 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ |
966 | 2097 : "memory"\ |
954 | 2098 );\ |
2099 dst+=dstStride;\ | |
2100 src+=srcStride;\ | |
2101 }\ | |
2102 }\ | |
2103 \ | |
/* 8-wide MMX2 version. Operands are numbered as in the 16-wide variant,   */\
/* but srcStride/dstStride use the S and D constraints here. temp is       */\
/* passed as %5 although no instruction in this block references %5.       */\
1057 | 2104 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
959 | 2105 uint64_t temp;\ |
2106 \ | |
2107 asm volatile(\ | |
2979 | 2108 "pxor %%mm7, %%mm7 \n\t"\ |
2109 "1: \n\t"\ | |
2110 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ | |
2111 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ | |
2112 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ | |
2113 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ | |
2114 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ | |
2115 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ | |
2116 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ | |
2117 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ | |
2118 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ | |
2119 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ | |
2120 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ | |
2121 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ | |
2122 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ | |
2123 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ | |
2124 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ | |
2125 "paddw %%mm3, %%mm5 \n\t" /* b */\ | |
2126 "paddw %%mm2, %%mm6 \n\t" /* c */\ | |
2127 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
2128 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ | |
2129 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ | |
2130 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ | |
2131 "paddw %%mm4, %%mm0 \n\t" /* a */\ | |
2132 "paddw %%mm1, %%mm5 \n\t" /* d */\ | |
2133 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ | |
2134 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ | |
2135 "paddw %6, %%mm6 \n\t"\ | |
2136 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ | |
2137 "psraw $5, %%mm0 \n\t"\ | |
959 | 2138 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ |
2139 \ | |
2979 | 2140 "movd 5(%0), %%mm5 \n\t" /* FGHI */\ |
2141 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\ | |
2142 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\ | |
2143 "paddw %%mm5, %%mm1 \n\t" /* a */\ | |
2144 "paddw %%mm6, %%mm2 \n\t" /* b */\ | |
2145 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\ | |
2146 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\ | |
2147 "paddw %%mm6, %%mm3 \n\t" /* c */\ | |
2148 "paddw %%mm5, %%mm4 \n\t" /* d */\ | |
2149 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ | |
2150 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ | |
2151 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ | |
2152 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ | |
2153 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ | |
2154 "paddw %6, %%mm1 \n\t"\ | |
2155 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ | |
2156 "psraw $5, %%mm3 \n\t"\ | |
2157 "packuswb %%mm3, %%mm0 \n\t"\ | |
959 | 2158 OP_MMX2(%%mm0, (%1), %%mm4, q)\ |
2159 \ | |
2979 | 2160 "add %3, %0 \n\t"\ |
2161 "add %4, %1 \n\t"\ | |
2162 "decl %2 \n\t"\ | |
2163 " jnz 1b \n\t"\ | |
967 | 2164 : "+a"(src), "+c"(dst), "+m"(h)\ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2165 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ |
966 | 2166 : "memory"\ |
959 | 2167 );\ |
2168 }\ | |
2169 \ | |
/* 8-wide 3DNow! version: same split as the 16-wide one, filter sums in C, */\
/* then round, shift, pack and store eight pixels through OP_3DNOW.        */\
2170 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
2171 int i;\ | |
2172 int16_t temp[8];\ | |
2173 /* quick HACK, XXX FIXME MUST be optimized */\ | |
2174 for(i=0; i<h; i++)\ | |
2175 {\ | |
2176 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ | |
2177 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ | |
2178 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ | |
2179 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ | |
2180 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ | |
2181 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\ | |
2182 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\ | |
2183 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\ | |
2184 asm volatile(\ | |
2979 | 2185 "movq (%0), %%mm0 \n\t"\ |
2186 "movq 8(%0), %%mm1 \n\t"\ | |
2187 "paddw %2, %%mm0 \n\t"\ | |
2188 "paddw %2, %%mm1 \n\t"\ | |
2189 "psraw $5, %%mm0 \n\t"\ | |
2190 "psraw $5, %%mm1 \n\t"\ | |
2191 "packuswb %%mm1, %%mm0 \n\t"\ | |
959 | 2192 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ |
2193 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ | |
966 | 2194 :"memory"\ |
959 | 2195 );\ |
2196 dst+=dstStride;\ | |
2197 src+=srcStride;\ | |
2198 }\ | |
2199 } | |
2200 | |
2201 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ | |
2202 \ | |
/* OPNAME mpeg4_qpel16_v_lowpass_MMX(dst, src, dstStride, srcStride):\
 * column-direction lowpass for a 16-byte-wide block.  17 source rows are read\
 * (pitch srcStride) and 16 rows are written to dst (pitch dstStride).  The\
 * filter arithmetic itself lives in QPEL_V_LOW, defined elsewhere in this file;\
 * ROUNDER is passed to the asm as a memory operand. */\
2203 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
954 | 2204 uint64_t temp[17*4];\ |
2205 uint64_t *temp_ptr= temp;\ | |
2206 int count= 17;\ | |
2207 \ | |
2208 /*FIXME unroll */\ | |
/* Pass 1: for each of the 17 rows, widen the 16 source bytes to 16-bit words\
   (unpack against the zeroed mm7) and store them as four qwords, one in each\
   of four 17-qword columns of temp (columns are 17*8 bytes apart). */\
2209 asm volatile(\ | |
2979 | 2210 "pxor %%mm7, %%mm7 \n\t"\ |
2211 "1: \n\t"\ | |
2212 "movq (%0), %%mm0 \n\t"\ | |
2213 "movq (%0), %%mm1 \n\t"\ | |
2214 "movq 8(%0), %%mm2 \n\t"\ | |
2215 "movq 8(%0), %%mm3 \n\t"\ | |
2216 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2217 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
2218 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
2219 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
2220 "movq %%mm0, (%1) \n\t"\ | |
2221 "movq %%mm1, 17*8(%1) \n\t"\ | |
2222 "movq %%mm2, 2*17*8(%1) \n\t"\ | |
2223 "movq %%mm3, 3*17*8(%1) \n\t"\ | |
2224 "add $8, %1 \n\t"\ | |
2225 "add %3, %0 \n\t"\ | |
2226 "decl %2 \n\t"\ | |
2227 " jnz 1b \n\t"\ | |
954 | 2228 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2229 : "r" ((long)srcStride)\ |
966 | 2230 : "memory"\ |
954 | 2231 );\ |
2232 \ | |
2233 temp_ptr= temp;\ | |
2234 count=4;\ | |
2235 \ | |
2236 /*FIXME reorder for speed */\ | |
/* Pass 2: one loop iteration per temp column (17 qwords = 136 bytes), four in\
   total.  Operands: %0 temp_ptr, %1 dst, %2 count, %3 dstStride,\
   %4 2*dstStride, %5 ROUNDER, %6 4-14*dstStride.\
   Sixteen QPEL_V_LOW steps write rows in pairs to (%1) and (%1,%3); %1 is\
   advanced by %4 between pairs (seven times).  The input offsets used for the\
   first and last rows repeat or step backwards so that they stay inside the\
   17 stored rows (byte offsets 0..128).  At the end of an iteration %0 moves\
   to the next column and %6 moves dst 4 bytes right and 14 rows back up.\
   NOTE(review): QPEL_V_LOW is handed %5, %6, %5 in its 5th-7th argument\
   slots; with ff_pw_20/ff_pw_3 commented out of the operand list, %6 is now\
   the dst fix-up operand - confirm QPEL_V_LOW does not emit its 5th and 6th\
   arguments. */\
2237 asm volatile(\ | |
2979 | 2238 /*"pxor %%mm7, %%mm7 \n\t"*/\ |
2239 "1: \n\t"\ | |
2240 "movq (%0), %%mm0 \n\t"\ | |
2241 "movq 8(%0), %%mm1 \n\t"\ | |
2242 "movq 16(%0), %%mm2 \n\t"\ | |
2243 "movq 24(%0), %%mm3 \n\t"\ | |
961 | 2244 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
2245 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ | |
2979 | 2246 "add %4, %1 \n\t"\ |
961 | 2247 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
954 | 2248 \ |
961 | 2249 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
2979 | 2250 "add %4, %1 \n\t"\ |
961 | 2251 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
2252 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ | |
2979 | 2253 "add %4, %1 \n\t"\ |
961 | 2254 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ |
2255 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ | |
2979 | 2256 "add %4, %1 \n\t"\ |
961 | 2257 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ |
2258 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ | |
2979 | 2259 "add %4, %1 \n\t"\ |
961 | 2260 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ |
2261 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ | |
2979 | 2262 "add %4, %1 \n\t"\ |
961 | 2263 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ |
954 | 2264 \ |
961 | 2265 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ |
2979 | 2266 "add %4, %1 \n\t" \ |
961 | 2267 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ |
2268 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ | |
954 | 2269 \ |
2979 | 2270 "add $136, %0 \n\t"\ |
2271 "add %6, %1 \n\t"\ | |
2272 "decl %2 \n\t"\ | |
2273 " jnz 1b \n\t"\ | |
958
9bb668034ecf
slowdown / gcc 2.95.* bug workaround (this should be reversed as soon as gcc 2.95.* support is droped)
michaelni
parents:
954
diff
changeset
|
2274 \ |
967 | 2275 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2276 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\ |
966 | 2277 :"memory"\ |
954 | 2278 );\ |
2279 }\ | |
2280 \ | |
/* OPNAME mpeg4_qpel8_v_lowpass_MMX(dst, src, dstStride, srcStride):\
 * column-direction lowpass for an 8-byte-wide block.  9 source rows are read\
 * (pitch srcStride) and 8 rows are written to dst (pitch dstStride).  The\
 * filter arithmetic itself lives in QPEL_V_LOW, defined elsewhere in this file;\
 * ROUNDER is passed to the asm as a memory operand. */\
1057 | 2281 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2209 | 2282 uint64_t temp[9*2];\ |
954 | 2283 uint64_t *temp_ptr= temp;\ |
2284 int count= 9;\ | |
2285 \ | |
2286 /*FIXME unroll */\ | |
/* Pass 1: for each of the 9 rows, widen the 8 source bytes to 16-bit words\
   (unpack against the zeroed mm7) and store them as two qwords, one in each\
   of two 9-qword columns of temp (columns are 9*8 bytes apart). */\
2287 asm volatile(\ | |
2979 | 2288 "pxor %%mm7, %%mm7 \n\t"\ |
2289 "1: \n\t"\ | |
2290 "movq (%0), %%mm0 \n\t"\ | |
2291 "movq (%0), %%mm1 \n\t"\ | |
2292 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2293 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
2294 "movq %%mm0, (%1) \n\t"\ | |
2295 "movq %%mm1, 9*8(%1) \n\t"\ | |
2296 "add $8, %1 \n\t"\ | |
2297 "add %3, %0 \n\t"\ | |
2298 "decl %2 \n\t"\ | |
2299 " jnz 1b \n\t"\ | |
954 | 2300 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2301 : "r" ((long)srcStride)\ |
966 | 2302 : "memory"\ |
954 | 2303 );\ |
2304 \ | |
2305 temp_ptr= temp;\ | |
2306 count=2;\ | |
2307 \ | |
2308 /*FIXME reorder for speed */\ | |
/* Pass 2: one loop iteration per temp column (9 qwords = 72 bytes), two in\
   total.  Operands: %0 temp_ptr, %1 dst, %2 count, %3 dstStride,\
   %4 2*dstStride, %5 ROUNDER, %6 4-6*dstStride.\
   Eight QPEL_V_LOW steps write rows in pairs to (%1) and (%1,%3); %1 is\
   advanced by %4 between pairs (three times).  The input offsets used for the\
   first and last rows repeat or step backwards so that they stay inside the\
   9 stored rows (byte offsets 0..64).  At the end of an iteration %0 moves to\
   the next column and %6 moves dst 4 bytes right and 6 rows back up.\
   NOTE(review): QPEL_V_LOW is handed %5, %6, %5 in its 5th-7th argument\
   slots; with ff_pw_20/ff_pw_3 commented out of the operand list, %6 is now\
   the dst fix-up operand - confirm QPEL_V_LOW does not emit its 5th and 6th\
   arguments. */\
2309 asm volatile(\ | |
2979 | 2310 /*"pxor %%mm7, %%mm7 \n\t"*/\ |
2311 "1: \n\t"\ | |
2312 "movq (%0), %%mm0 \n\t"\ | |
2313 "movq 8(%0), %%mm1 \n\t"\ | |
2314 "movq 16(%0), %%mm2 \n\t"\ | |
2315 "movq 24(%0), %%mm3 \n\t"\ | |
961 | 2316 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
2317 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ | |
2979 | 2318 "add %4, %1 \n\t"\ |
961 | 2319 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
954 | 2320 \ |
961 | 2321 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
2979 | 2322 "add %4, %1 \n\t"\ |
961 | 2323 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
954 | 2324 \ |
961 | 2325 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ |
2979 | 2326 "add %4, %1 \n\t"\ |
961 | 2327 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ |
2328 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ | |
954 | 2329 \ |
2979 | 2330 "add $72, %0 \n\t"\ |
2331 "add %6, %1 \n\t"\ | |
2332 "decl %2 \n\t"\ | |
2333 " jnz 1b \n\t"\ | |
954 | 2334 \ |
961 | 2335 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2336 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\ |
966 | 2337 : "memory"\ |
2338 );\ | |
959 | 2339 }\ |
954 | 2340 \ |
1064 | 2341 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ |
959 | 2342 OPNAME ## pixels8_mmx(dst, src, stride, 8);\ |
954 | 2343 }\ |
2344 \ | |
1064 | 2345 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2346 uint64_t temp[8];\ |
954 | 2347 uint8_t * const half= (uint8_t*)temp;\ |
2348 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2349 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ |
954 | 2350 }\ |
2351 \ | |
1064 | 2352 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2353 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ |
2354 }\ | |
2355 \ | |
1064 | 2356 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2357 uint64_t temp[8];\ |
954 | 2358 uint8_t * const half= (uint8_t*)temp;\ |
2359 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2360 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ |
954 | 2361 }\ |
2362 \ | |
1064 | 2363 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2364 uint64_t temp[8];\ |
954 | 2365 uint8_t * const half= (uint8_t*)temp;\ |
959 | 2366 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2367 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ |
954 | 2368 }\ |
2369 \ | |
1064 | 2370 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
959 | 2371 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ |
954 | 2372 }\ |
2373 \ | |
1064 | 2374 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2375 uint64_t temp[8];\ |
954 | 2376 uint8_t * const half= (uint8_t*)temp;\ |
959 | 2377 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2378 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ |
954 | 2379 }\ |
1064 | 2380 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2381 uint64_t half[8 + 9];\ |
2382 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
2383 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2384 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2385 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ |
959 | 2386 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2387 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ |
954 | 2388 }\ |
1064 | 2389 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2390 uint64_t half[8 + 9];\ |
2391 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
2392 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2393 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2394 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ |
959 | 2395 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2396 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ |
954 | 2397 }\ |
1064 | 2398 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2399 uint64_t half[8 + 9];\ |
2400 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
2401 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2402 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2403 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ |
959 | 2404 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2405 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ |
954 | 2406 }\ |
1064 | 2407 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2408 uint64_t half[8 + 9];\ |
2409 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
2410 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2411 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2412 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ |
959 | 2413 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2414 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ |
954 | 2415 }\ |
1064 | 2416 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2417 uint64_t half[8 + 9];\ |
954 | 2418 uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
2419 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2420 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
959 | 2421 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2422 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ |
954 | 2423 }\ |
1064 | 2424 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2425 uint64_t half[8 + 9];\ |
954 | 2426 uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
2427 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2428 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
959 | 2429 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2430 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ |
954 | 2431 }\ |
1064 | 2432 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2433 uint64_t half[8 + 9];\ |
2434 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 2435 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2436 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ |
984 | 2437 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ |
954 | 2438 }\ |
1064 | 2439 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2440 uint64_t half[8 + 9];\ |
2441 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 2442 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2443 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ |
984 | 2444 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ |
954 | 2445 }\ |
1064 | 2446 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2447 uint64_t half[9];\ |
954 | 2448 uint8_t * const halfH= ((uint8_t*)half);\ |
2449 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
959 | 2450 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ |
954 | 2451 }\ |
1064 | 2452 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ |
959 | 2453 OPNAME ## pixels16_mmx(dst, src, stride, 16);\ |
954 | 2454 }\ |
2455 \ | |
1064 | 2456 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2457 uint64_t temp[32];\ |
2458 uint8_t * const half= (uint8_t*)temp;\ | |
2459 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2460 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ |
954 | 2461 }\ |
2462 \ | |
1064 | 2463 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2464 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ |
2465 }\ | |
2466 \ | |
1064 | 2467 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2468 uint64_t temp[32];\ |
2469 uint8_t * const half= (uint8_t*)temp;\ | |
2470 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2471 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\ |
954 | 2472 }\ |
2473 \ | |
1064 | 2474 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2475 uint64_t temp[32];\ |
2476 uint8_t * const half= (uint8_t*)temp;\ | |
959 | 2477 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2478 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ |
954 | 2479 }\ |
2480 \ | |
1064 | 2481 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
959 | 2482 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ |
954 | 2483 }\ |
2484 \ | |
1064 | 2485 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2486 uint64_t temp[32];\ |
2487 uint8_t * const half= (uint8_t*)temp;\ | |
959 | 2488 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2489 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\ |
954 | 2490 }\ |
1064 | 2491 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2492 uint64_t half[16*2 + 17*2];\ |
2493 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2494 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2495 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2496 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ |
959 | 2497 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2498 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ |
954 | 2499 }\ |
1064 | 2500 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2501 uint64_t half[16*2 + 17*2];\ |
2502 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2503 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2504 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2505 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ |
959 | 2506 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2507 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ |
954 | 2508 }\ |
1064 | 2509 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2510 uint64_t half[16*2 + 17*2];\ |
2511 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2512 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2513 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2514 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ |
959 | 2515 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2516 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ |
954 | 2517 }\ |
1064 | 2518 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2519 uint64_t half[16*2 + 17*2];\ |
2520 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2521 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2522 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2523 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ |
959 | 2524 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2525 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ |
954 | 2526 }\ |
1064 | 2527 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2528 uint64_t half[16*2 + 17*2];\ |
2529 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2530 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2531 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
959 | 2532 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2533 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ |
954 | 2534 }\ |
1064 | 2535 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2536 uint64_t half[16*2 + 17*2];\ |
2537 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2538 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2539 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
959 | 2540 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2541 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ |
954 | 2542 }\ |
1064 | 2543 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2544 uint64_t half[17*2];\ |
2545 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 2546 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2547 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ |
984 | 2548 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
954 | 2549 }\ |
1064 | 2550 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2551 uint64_t half[17*2];\ |
2552 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 2553 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2554 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ |
984 | 2555 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
954 | 2556 }\ |
1064 | 2557 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2558 uint64_t half[17*2];\ |
2559 uint8_t * const halfH= ((uint8_t*)half);\ | |
2560 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
959 | 2561 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
954 | 2562 } |
2563 | |
/* Store-operation snippets, passed by name to the QPEL_BASE / QPEL_OP
 * instantiations.  Each expands to asm text; `size` is pasted onto "mov" to
 * pick the move width.
 * PUT_OP: a single mov of a to b; the temp argument is not used. */
2979 | 2564 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" |
/* AVG_3DNOW_OP: load b into temp, pavgusb temp into a, then mov a to b. */
959 | 2565 #define AVG_3DNOW_OP(a,b,temp, size) \ |
2979 | 2566 "mov" #size " " #b ", " #temp " \n\t"\ |
2567 "pavgusb " #temp ", " #a " \n\t"\ | |
2568 "mov" #size " " #a ", " #b " \n\t" | |
/* AVG_MMX2_OP: same sequence as AVG_3DNOW_OP with pavgb in place of pavgusb. */
959 | 2569 #define AVG_MMX2_OP(a,b,temp, size) \ |
2979 | 2570 "mov" #size " " #b ", " #temp " \n\t"\ |
2571 "pavgb " #temp ", " #a " \n\t"\ | |
2572 "mov" #size " " #a ", " #b " \n\t" | |
959 | 2573 |
/* Template instantiations.  QPEL_BASE is expanded once per name prefix (put_,
 * avg_, put_no_rnd_); QPEL_OP is expanded for each prefix with the 3dnow and
 * the mmx2 suffix.  The put_no_rnd_ variants pass ff_pw_15 as ROUNDER, all
 * others ff_pw_16. */
2574 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP) | |
2575 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP) | |
2576 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP) | |
2577 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow) | |
2578 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow) | |
2579 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) | |
954 | 2580 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) |
959 | 2581 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) |
954 | 2582 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) |
2583 | |
3807
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2584 /***********************************/ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2585 /* bilinear qpel: not compliant with any spec, only for -lavdopts fast */ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2586 |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
/* QPEL_2TAP_XY: defines OPNAME 2tap_qpel<SIZE>_mc<XY>_<MMX> as a one-line
 * forward to the OPNAME pixels<SIZE><HPEL> helper, passing SIZE as the last
 * argument. */
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
}
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
/* QPEL_2TAP_L3: defines OPNAME 2tap_qpel<SIZE>_mc<XY>_<MMX> as a one-line
 * forward to the OPNAME 2tap_qpel<SIZE>_l3_<MMX> helper.  S0 is added to src;
 * S1 and S2 are passed through unchanged as the trailing arguments. */
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
}
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2595 |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2596 #define QPEL_2TAP(OPNAME, SIZE, MMX)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2597 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2598 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2599 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2600 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2601 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2602 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2603 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2604 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2605 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2606 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2607 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2608 }\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2609 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2610 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2611 }\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2612 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2613 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2614 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2615 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2616 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2617 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2618 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2619 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2620 |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2621 QPEL_2TAP(put_, 16, mmx2) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2622 QPEL_2TAP(avg_, 16, mmx2) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2623 QPEL_2TAP(put_, 8, mmx2) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2624 QPEL_2TAP(avg_, 8, mmx2) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2625 QPEL_2TAP(put_, 16, 3dnow) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2626 QPEL_2TAP(avg_, 16, 3dnow) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2627 QPEL_2TAP(put_, 8, 3dnow) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2628 QPEL_2TAP(avg_, 8, 3dnow) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2629 |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
2630 |
#if 0
/* Disabled no-op helper; kept only for reference, it is never compiled. */
static void just_return()
{
    return;
}
#endif
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2634 |
/*
 * Stores the put_, put_no_rnd_ and avg_ flavours of function `postfix2` into
 * the matching members `postfix1` of the structure pointed to by `c`.
 * `c` is not a macro parameter: it must be in scope where the macro is used.
 * The expansion is three complete statements and already ends in ';'.
 */
#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_        ## postfix1 = put_        ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_        ## postfix1 = avg_        ## postfix2;
1092 | 2639 |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2640 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2641 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){ |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2642 const int w = 8; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2643 const int ix = ox>>(16+shift); |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2644 const int iy = oy>>(16+shift); |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2645 const int oxs = ox>>4; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2646 const int oys = oy>>4; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2647 const int dxxs = dxx>>4; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2648 const int dxys = dxy>>4; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2649 const int dyxs = dyx>>4; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2650 const int dyys = dyy>>4; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2651 const uint16_t r4[4] = {r,r,r,r}; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2652 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys}; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2653 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys}; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2654 const uint64_t shift2 = 2*shift; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2655 uint8_t edge_buf[(h+1)*stride]; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2656 int x, y; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2657 |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2658 const int dxw = (dxx-(1<<(16+shift)))*(w-1); |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2659 const int dyh = (dyy-(1<<(16+shift)))*(h-1); |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2660 const int dxh = dxy*(h-1); |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2661 const int dyw = dyx*(w-1); |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2662 if( // non-constant fullpel offset (3% of blocks) |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2663 (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) | |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2664 oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift) |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2665 // uses more than 16 bits of subpel mv (only at huge resolution) |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2666 || (dxx|dxy|dyx|dyy)&15 ) |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2667 { |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2668 //FIXME could still use mmx for some of the rows |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2669 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height); |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2670 return; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2671 } |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2672 |
3250 | 2673 src += ix + iy*stride; |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2674 if( (unsigned)ix >= width-w || |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2675 (unsigned)iy >= height-h ) |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2676 { |
3250 | 2677 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height); |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2678 src = edge_buf; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2679 } |
3250 | 2680 |
2681 asm volatile( | |
2682 "movd %0, %%mm6 \n\t" | |
2683 "pxor %%mm7, %%mm7 \n\t" | |
2684 "punpcklwd %%mm6, %%mm6 \n\t" | |
2685 "punpcklwd %%mm6, %%mm6 \n\t" | |
2686 :: "r"(1<<shift) | |
2687 ); | |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2688 |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2689 for(x=0; x<w; x+=4){ |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2690 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0), |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2691 oxs - dxys + dxxs*(x+1), |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2692 oxs - dxys + dxxs*(x+2), |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2693 oxs - dxys + dxxs*(x+3) }; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2694 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0), |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2695 oys - dyys + dyxs*(x+1), |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2696 oys - dyys + dyxs*(x+2), |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2697 oys - dyys + dyxs*(x+3) }; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2698 |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2699 for(y=0; y<h; y++){ |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2700 asm volatile( |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2701 "movq %0, %%mm4 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2702 "movq %1, %%mm5 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2703 "paddw %2, %%mm4 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2704 "paddw %3, %%mm5 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2705 "movq %%mm4, %0 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2706 "movq %%mm5, %1 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2707 "psrlw $12, %%mm4 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2708 "psrlw $12, %%mm5 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2709 : "+m"(*dx4), "+m"(*dy4) |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2710 : "m"(*dxy4), "m"(*dyy4) |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2711 ); |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2712 |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2713 asm volatile( |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2714 "movq %%mm6, %%mm2 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2715 "movq %%mm6, %%mm1 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2716 "psubw %%mm4, %%mm2 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2717 "psubw %%mm5, %%mm1 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2718 "movq %%mm2, %%mm0 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2719 "movq %%mm4, %%mm3 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2720 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy) |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2721 "pmullw %%mm5, %%mm3 \n\t" // dx*dy |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2722 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2723 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy) |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2724 |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2725 "movd %4, %%mm5 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2726 "movd %3, %%mm4 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2727 "punpcklbw %%mm7, %%mm5 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2728 "punpcklbw %%mm7, %%mm4 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2729 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2730 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2731 |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2732 "movd %2, %%mm5 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2733 "movd %1, %%mm4 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2734 "punpcklbw %%mm7, %%mm5 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2735 "punpcklbw %%mm7, %%mm4 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2736 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy) |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2737 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy) |
3250 | 2738 "paddw %5, %%mm1 \n\t" |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2739 "paddw %%mm3, %%mm2 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2740 "paddw %%mm1, %%mm0 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2741 "paddw %%mm2, %%mm0 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2742 |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2743 "psrlw %6, %%mm0 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2744 "packuswb %%mm0, %%mm0 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2745 "movd %%mm0, %0 \n\t" |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2746 |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2747 : "=m"(dst[x+y*stride]) |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2748 : "m"(src[0]), "m"(src[1]), |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2749 "m"(src[stride]), "m"(src[stride+1]), |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2750 "m"(*r4), "m"(shift2) |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2751 ); |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2752 src += stride; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2753 } |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2754 src += 4-h*stride; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2755 } |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2756 } |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
2757 |
#ifdef CONFIG_ENCODERS

/*
 * dsputil_mmx_qns.h is a template: it is included once per CPU flavour below.
 * Before each inclusion the following macros select the flavour:
 *   DEF(x)               - appends the flavour suffix to a function name
 *   SET_RND              - rounder setup (MOVQ_WONE for plain MMX, nothing otherwise)
 *   SCALE_OFFSET         - flavour-specific scale adjustment
 *   PMULHRW(x, y, s, o)  - multiplies x and y by s (high-word multiply, rounded)
 *   PHADDD(a, t)         - adds the two dwords of a, using t as scratch
 * All of them except the generic PHADDD are undefined again after each use.
 */

/* horizontal add of the two 32-bit halves of a; result in the low dword */
#define PHADDD(a, t)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddd "#t", "#a" \n\t"
/*
   pmulhw:   dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw:  dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */

/* ---- plain MMX: pmulhw, then add the offset o and shift right by one ---- */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x " \n\t"\
    "pmulhw " #s ", "#y " \n\t"\
    "paddw " #o ", "#x " \n\t"\
    "paddw " #o ", "#y " \n\t"\
    "psraw $1, "#x " \n\t"\
    "psraw $1, "#y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

/* ---- 3DNow!: pmulhrw rounds by itself, no rounder needed ---- */
#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x " \n\t"\
    "pmulhrw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
/* ---- SSSE3: pmulhrsw, and a pshufw based replacement for PHADDD ---- */
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x " \n\t"\
    "pmulhrsw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3

#endif /* CONFIG_ENCODERS */
2754 | 2823 |
3215
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3213
diff
changeset
|
/**
 * Generate a static helper `name(mem, stride, h)` that issues the prefetch
 * instruction `op` once per row: first on the byte at `mem`, then on the byte
 * `stride` further on, and so on for `h` rows.
 *
 * The loop body always runs at least once and stops when the decremented
 * counter reaches zero, so callers are expected to pass h >= 1.
 */
#define PREFETCH(name, op)                                  \
static void name(void *mem, int stride, int h)              \
{                                                           \
    const uint8_t *row = mem;                               \
    for (;;) {                                              \
        __asm__ volatile(#op " %0" :: "m"(*row));           \
        row += stride;                                      \
        if (--h == 0)                                       \
            break;                                          \
    }                                                       \
}
PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3213
diff
changeset
|
2835 |
2754 | 2836 #include "h264dsp_mmx.c" |
2967 | 2837 |
3524 | 2838 /* AVS specific */ |
2839 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx); | |
2840 | |
/*
 * CAVS qpel "mc00" entry points.  Each one simply forwards to the generic
 * MMX put/avg pixel routine of the matching block width, passing the block
 * size (8 or 16) as the last argument.
 */

/** 8-wide put: forwards to put_pixels8_mmx(dst, src, stride, 8). */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

/** 8-wide avg: forwards to avg_pixels8_mmx(dst, src, stride, 8). */
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}

/** 16-wide put: forwards to put_pixels16_mmx(dst, src, stride, 16). */
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}

/** 16-wide avg: forwards to avg_pixels16_mmx(dst, src, stride, 16). */
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}
2853 | |
1092 | 2854 /* external functions, from idct_mmx.c */ |
2855 void ff_mmx_idct(DCTELEM *block); | |
2856 void ff_mmxext_idct(DCTELEM *block); | |
2857 | |
2858 /* XXX: those functions should be suppressed ASAP when all IDCTs are | |
2859 converted */ | |
4020
723818b5de0f
Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure
diego
parents:
4001
diff
changeset
|
#ifdef CONFIG_GPL
/*
 * Glue for the ff_mmx_idct() / ff_mmxext_idct() transforms, compiled only
 * when CONFIG_GPL is defined.  Every wrapper runs the IDCT in place on
 * `block` and then hands the result to the clamped put/add pixel routine
 * together with `dest` and `line_size`.
 */

/** ff_mmx_idct() followed by put_pixels_clamped_mmx(). */
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct(block);
    put_pixels_clamped_mmx(block, dest, line_size);
}

/** ff_mmx_idct() followed by add_pixels_clamped_mmx(). */
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}

/** ff_mmxext_idct() followed by put_pixels_clamped_mmx(). */
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct(block);
    put_pixels_clamped_mmx(block, dest, line_size);
}

/** ff_mmxext_idct() followed by add_pixels_clamped_mmx(). */
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
#endif /* CONFIG_GPL */
2868 | 2882 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block) |
2883 { | |
2884 ff_idct_xvid_mmx (block); | |
2885 put_pixels_clamped_mmx(block, dest, line_size); | |
2886 } | |
2887 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block) | |
2888 { | |
2889 ff_idct_xvid_mmx (block); | |
2890 add_pixels_clamped_mmx(block, dest, line_size); | |
2891 } | |
2892 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block) | |
2893 { | |
2894 ff_idct_xvid_mmx2 (block); | |
2895 put_pixels_clamped_mmx(block, dest, line_size); | |
2896 } | |
2897 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block) | |
2898 { | |
2899 ff_idct_xvid_mmx2 (block); | |
2900 add_pixels_clamped_mmx(block, dest, line_size); | |
2901 } | |
2967 | 2902 |
3541
3fbddeb13686
10l, vorbis_inverse_coupling_sse() was really 3dnow
lorenm
parents:
3536
diff
changeset
|
/**
 * In-place Vorbis inverse channel coupling, 3DNow! version.
 *
 * Works on two floats of mag[] and ang[] per iteration, so elements are
 * consumed in pairs starting at index 0 while the index is below blocksize.
 * Per element, with m = mag[k] and a = ang[k] on entry:
 *   t      = (m >= 0.0) ? -a : a        (sign bit of a flipped where m >= 0)
 *   ang[k] = m + ((a >= 0.0) ? t : 0)
 *   mag[k] = m - ((a >= 0.0) ? 0 : t)
 *
 * MMX state is released with femms before returning.
 */
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int pos;

    /* mm7 = 0.0 in both lanes; every loop iteration compares against it. */
    __asm__ volatile("pxor %%mm7, %%mm7":);

    for (pos = 0; pos < blocksize; pos += 2) {
        __asm__ volatile(
            "movq     %0,    %%mm0 \n\t" /* mm0 = m */
            "movq     %1,    %%mm1 \n\t" /* mm1 = a */
            "movq     %%mm0, %%mm2 \n\t"
            "movq     %%mm1, %%mm3 \n\t"
            "pfcmpge  %%mm7, %%mm2 \n\t" /* mm2 = mask(m >= 0.0) */
            "pfcmpge  %%mm7, %%mm3 \n\t" /* mm3 = mask(a >= 0.0) */
            "pslld    $31,   %%mm2 \n\t" /* reduce the m mask to the sign bit */
            "pxor     %%mm2, %%mm1 \n\t" /* mm1 = t (a with sign flipped where m >= 0) */
            "movq     %%mm3, %%mm4 \n\t"
            "pand     %%mm1, %%mm3 \n\t" /* mm3 = t where a >= 0, else 0 */
            "pandn    %%mm1, %%mm4 \n\t" /* mm4 = t where a <  0, else 0 */
            "pfadd    %%mm0, %%mm3 \n\t" /* new ang = m + mm3 */
            "pfsub    %%mm4, %%mm0 \n\t" /* new mag = m - mm4 */
            "movq     %%mm3, %1    \n\t"
            "movq     %%mm0, %0    \n\t"
            : "+m"(mag[pos]), "+m"(ang[pos])
            :
            : "memory"
        );
    }

    __asm__ volatile("femms");
}
3557
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2930 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2931 { |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2932 int i; |
3557
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2933 |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2934 asm volatile( |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2935 "movaps %0, %%xmm5 \n\t" |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2936 ::"m"(ff_pdw_80000000[0]) |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2937 ); |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2938 for(i=0; i<blocksize; i+=4) { |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2939 asm volatile( |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2940 "movaps %0, %%xmm0 \n\t" |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2941 "movaps %1, %%xmm1 \n\t" |
3557
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2942 "xorps %%xmm2, %%xmm2 \n\t" |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2943 "xorps %%xmm3, %%xmm3 \n\t" |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2944 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0 |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2945 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0 |
3557
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2946 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2947 "xorps %%xmm2, %%xmm1 \n\t" |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2948 "movaps %%xmm3, %%xmm4 \n\t" |
3557
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2949 "andps %%xmm1, %%xmm3 \n\t" |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2950 "andnps %%xmm1, %%xmm4 \n\t" |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2951 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2952 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2953 "movaps %%xmm3, %1 \n\t" |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2954 "movaps %%xmm0, %0 \n\t" |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2955 :"+m"(mag[i]), "+m"(ang[i]) |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2956 ::"memory" |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2957 ); |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2958 } |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2959 } |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2960 |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
/**
 * dst[k] *= src[k], 3DNow! version.
 *
 * Walks the arrays from the end towards the start, four floats (16 bytes)
 * per iteration, beginning at float index len-4.  The loop body runs before
 * the counter is tested, so len is expected to be a positive multiple of 4.
 * MMX state is released with femms inside the asm block.
 */
static void vector_fmul_3dnow(float *dst, const float *src, int len)
{
    long ofs = (len - 4) * 4; /* byte offset of the last group of four floats */

    __asm__ volatile(
        "1:                          \n\t"
        "movq    (%1,%0),  %%mm0     \n\t"
        "movq   8(%1,%0),  %%mm1     \n\t"
        "pfmul   (%2,%0),  %%mm0     \n\t"
        "pfmul  8(%2,%0),  %%mm1     \n\t"
        "movq   %%mm0,   (%1,%0)     \n\t"
        "movq   %%mm1,  8(%1,%0)     \n\t"
        "sub    $16,     %0          \n\t"
        "jge    1b                   \n\t"
        "femms                       \n\t"
        : "+r"(ofs)
        : "r"(dst), "r"(src)
        : "memory"
    );
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
/**
 * dst[k] *= src[k], SSE version.
 *
 * Walks the arrays from the end towards the start, eight floats (32 bytes)
 * per iteration, beginning at float index len-8.  The loop body runs before
 * the counter is tested, so len is expected to be a positive multiple of 8.
 * movaps/mulps memory operands are used, so dst and src must be 16-byte
 * aligned.
 */
static void vector_fmul_sse(float *dst, const float *src, int len)
{
    long ofs = (len - 8) * 4; /* byte offset of the last group of eight floats */

    __asm__ volatile(
        "1:                            \n\t"
        "movaps    (%1,%0),  %%xmm0    \n\t"
        "movaps  16(%1,%0),  %%xmm1    \n\t"
        "mulps     (%2,%0),  %%xmm0    \n\t"
        "mulps   16(%2,%0),  %%xmm1    \n\t"
        "movaps  %%xmm0,    (%1,%0)    \n\t"
        "movaps  %%xmm1,  16(%1,%0)    \n\t"
        "sub     $32,     %0           \n\t"
        "jge     1b                    \n\t"
        : "+r"(ofs)
        : "r"(dst), "r"(src)
        : "memory"
    );
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
2996 |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
2997 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){ |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
2998 long i = len*4-16; |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
2999 asm volatile( |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3000 "1: \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3001 "pswapd 8(%1), %%mm0 \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3002 "pswapd (%1), %%mm1 \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3003 "pfmul (%3,%0), %%mm0 \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3004 "pfmul 8(%3,%0), %%mm1 \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3005 "movq %%mm0, (%2,%0) \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3006 "movq %%mm1, 8(%2,%0) \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3007 "add $16, %1 \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3008 "sub $16, %0 \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3009 "jge 1b \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3010 :"+r"(i), "+r"(src1) |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3011 :"r"(dst), "r"(src0) |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3012 ); |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3013 asm volatile("femms"); |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3014 } |
3569
c42c03f3b402
convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents:
3568
diff
changeset
|
3015 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){ |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3016 long i = len*4-32; |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3017 asm volatile( |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3018 "1: \n\t" |
3569
c42c03f3b402
convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents:
3568
diff
changeset
|
3019 "movaps 16(%1), %%xmm0 \n\t" |
c42c03f3b402
convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents:
3568
diff
changeset
|
3020 "movaps (%1), %%xmm1 \n\t" |
c42c03f3b402
convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents:
3568
diff
changeset
|
3021 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" |
c42c03f3b402
convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents:
3568
diff
changeset
|
3022 "shufps $0x1b, %%xmm1, %%xmm1 \n\t" |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3023 "mulps (%3,%0), %%xmm0 \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3024 "mulps 16(%3,%0), %%xmm1 \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3025 "movaps %%xmm0, (%2,%0) \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3026 "movaps %%xmm1, 16(%2,%0) \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3027 "add $32, %1 \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3028 "sub $32, %0 \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3029 "jge 1b \n\t" |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3030 :"+r"(i), "+r"(src1) |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3031 :"r"(dst), "r"(src0) |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3032 ); |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3033 } |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3034 |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
/**
 * dst[k*step] = src0[k] * src1[k] + src2[k], 3DNow! version.
 *
 * Only src3 == 0 with step 1 or step 2 is handled in assembly; every other
 * combination is forwarded unchanged to ff_vector_fmul_add_add_c().
 * The asm paths walk the sources from the end towards the start, four
 * floats (16 bytes) per iteration, and run the loop body before testing
 * the counter, so len is expected to be a positive multiple of 4.
 * femms is issued on every path, including the C fallback.
 */
static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
                                      const float *src2, int src3, int len, int step)
{
    long ofs = (len - 4) * 4; /* byte offset of the last group of four source floats */

    /* A non-zero src3 always takes the C fallback, whatever the step. */
    switch (src3 == 0 ? step : 0) {
    case 2:
        /* Interleaved output: point dst at the slot of source index len-4. */
        dst += (len - 4) * 2;
        __asm__ volatile(
            "1:                          \n\t"
            "movq    (%2,%0),  %%mm0     \n\t"
            "movq   8(%2,%0),  %%mm1     \n\t"
            "pfmul   (%3,%0),  %%mm0     \n\t"
            "pfmul  8(%3,%0),  %%mm1     \n\t"
            "pfadd   (%4,%0),  %%mm0     \n\t"
            "pfadd  8(%4,%0),  %%mm1     \n\t"
            "movd   %%mm0,    (%1)       \n\t" /* result 0 -> dst[0] */
            "movd   %%mm1,  16(%1)       \n\t" /* result 2 -> dst[4] */
            "psrlq  $32,    %%mm0        \n\t"
            "psrlq  $32,    %%mm1        \n\t"
            "movd   %%mm0,   8(%1)       \n\t" /* result 1 -> dst[2] */
            "movd   %%mm1,  24(%1)       \n\t" /* result 3 -> dst[6] */
            "sub    $32,    %1           \n\t"
            "sub    $16,    %0           \n\t"
            "jge    1b                   \n\t"
            : "+r"(ofs), "+r"(dst)
            : "r"(src0), "r"(src1), "r"(src2)
            : "memory"
        );
        break;

    case 1:
        __asm__ volatile(
            "1:                          \n\t"
            "movq    (%2,%0),  %%mm0     \n\t"
            "movq   8(%2,%0),  %%mm1     \n\t"
            "pfmul   (%3,%0),  %%mm0     \n\t"
            "pfmul  8(%3,%0),  %%mm1     \n\t"
            "pfadd   (%4,%0),  %%mm0     \n\t"
            "pfadd  8(%4,%0),  %%mm1     \n\t"
            "movq   %%mm0,   (%1,%0)     \n\t"
            "movq   %%mm1,  8(%1,%0)     \n\t"
            "sub    $16,    %0           \n\t"
            "jge    1b                   \n\t"
            : "+r"(ofs)
            : "r"(dst), "r"(src0), "r"(src1), "r"(src2)
            : "memory"
        );
        break;

    default:
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
        break;
    }

    __asm__ volatile("femms");
}
3569
c42c03f3b402
convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents:
3568
diff
changeset
|
/**
 * dst[k*step] = src0[k] * src1[k] + src2[k], SSE version.
 *
 * Only src3 == 0 with step 1 or step 2 is handled in assembly; every other
 * combination is forwarded unchanged to ff_vector_fmul_add_add_c().
 * The asm paths walk the sources from the end towards the start, eight
 * floats (32 bytes) per iteration, and run the loop body before testing
 * the counter, so len is expected to be a positive multiple of 8.
 * The sources are accessed with movaps/mulps/addps memory operands and must
 * be 16-byte aligned; the step-1 path also stores dst with movaps.
 */
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
                                    const float *src2, int src3, int len, int step)
{
    long ofs = (len - 8) * 4; /* byte offset of the last group of eight source floats */

    /* A non-zero src3 always takes the C fallback, whatever the step. */
    switch (src3 == 0 ? step : 0) {
    case 2:
        /* Interleaved output: point dst at the slot of source index len-8. */
        dst += (len - 8) * 2;
        __asm__ volatile(
            "1:                               \n\t"
            "movaps    (%2,%0),    %%xmm0     \n\t"
            "movaps  16(%2,%0),    %%xmm1     \n\t"
            "mulps     (%3,%0),    %%xmm0     \n\t"
            "mulps   16(%3,%0),    %%xmm1     \n\t"
            "addps     (%4,%0),    %%xmm0     \n\t"
            "addps   16(%4,%0),    %%xmm1     \n\t"
            "movss   %%xmm0,       (%1)       \n\t" /* result 0 -> dst[0]  */
            "movss   %%xmm1,     32(%1)       \n\t" /* result 4 -> dst[8]  */
            "movhlps %%xmm0,     %%xmm2       \n\t"
            "movhlps %%xmm1,     %%xmm3       \n\t"
            "movss   %%xmm2,     16(%1)       \n\t" /* result 2 -> dst[4]  */
            "movss   %%xmm3,     48(%1)       \n\t" /* result 6 -> dst[12] */
            "shufps  $0xb1, %%xmm0, %%xmm0    \n\t" /* swap floats within each pair */
            "shufps  $0xb1, %%xmm1, %%xmm1    \n\t"
            "movss   %%xmm0,      8(%1)       \n\t" /* result 1 -> dst[2]  */
            "movss   %%xmm1,     40(%1)       \n\t" /* result 5 -> dst[10] */
            "movhlps %%xmm0,     %%xmm2       \n\t"
            "movhlps %%xmm1,     %%xmm3       \n\t"
            "movss   %%xmm2,     24(%1)       \n\t" /* result 3 -> dst[6]  */
            "movss   %%xmm3,     56(%1)       \n\t" /* result 7 -> dst[14] */
            "sub     $64,        %1           \n\t"
            "sub     $32,        %0           \n\t"
            "jge     1b                       \n\t"
            : "+r"(ofs), "+r"(dst)
            : "r"(src0), "r"(src1), "r"(src2)
            : "memory"
        );
        break;

    case 1:
        __asm__ volatile(
            "1:                               \n\t"
            "movaps    (%2,%0),    %%xmm0     \n\t"
            "movaps  16(%2,%0),    %%xmm1     \n\t"
            "mulps     (%3,%0),    %%xmm0     \n\t"
            "mulps   16(%3,%0),    %%xmm1     \n\t"
            "addps     (%4,%0),    %%xmm0     \n\t"
            "addps   16(%4,%0),    %%xmm1     \n\t"
            "movaps  %%xmm0,      (%1,%0)     \n\t"
            "movaps  %%xmm1,    16(%1,%0)     \n\t"
            "sub     $32,        %0           \n\t"
            "jge     1b                       \n\t"
            : "+r"(ofs)
            : "r"(dst), "r"(src0), "r"(src1), "r"(src2)
            : "memory"
        );
        break;

    default:
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
        break;
    }
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3140 |
/**
 * Convert floats to signed 16-bit integers with 3DNow! instructions.
 *
 * Every loop iteration handles four samples: two pf2id instructions convert
 * src[i], src[i+1] and src[i+2], src[i+3] to 32-bit integers, packssdw packs
 * the four results to 16 bit with signed saturation, and movq stores them
 * starting at dst[i].
 *
 * @param dst output samples, 8 bytes are stored per iteration
 * @param src input samples, 16 bytes are read per iteration
 * @param len number of samples; the loop advances by 4, so a length that is
 *            not a multiple of 4 reads and writes past element len-1
 */
static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
    // not bit-exact: pf2id uses different rounding than C and SSE
    int i;
    for(i=0; i<len; i+=4) {
        __asm__ volatile(
            "pf2id       %1, %%mm0 \n\t"
            "pf2id       %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
            /* The operands only name dst[i], src[i] and src[i+2], yet each
             * instruction moves 8 bytes. The clobber tells the compiler that
             * the neighbouring elements are read and written as well. */
            :"memory"
        );
    }
    // leave MMX/3DNow! state before returning to code that may use the FPU
    __asm__ volatile("femms");
}
/**
 * Convert floats to signed 16-bit integers with SSE cvtps2pi.
 *
 * Every loop iteration handles four samples: two cvtps2pi instructions
 * convert src[i], src[i+1] and src[i+2], src[i+3] to 32-bit integers in MMX
 * registers, packssdw packs the four results to 16 bit with signed
 * saturation, and movq stores them starting at dst[i].
 *
 * @param dst output samples, 8 bytes are stored per iteration
 * @param src input samples, 16 bytes are read per iteration
 * @param len number of samples; the loop advances by 4, so a length that is
 *            not a multiple of 4 reads and writes past element len-1
 */
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        __asm__ volatile(
            "cvtps2pi    %1, %%mm0 \n\t"
            "cvtps2pi    %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
            /* The operands only name dst[i], src[i] and src[i+2], yet each
             * instruction moves 8 bytes. The clobber tells the compiler that
             * the neighbouring elements are read and written as well. */
            :"memory"
        );
    }
    // leave MMX state before returning to code that may use the FPU
    __asm__ volatile("emms");
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3170 |
4589
30261f4ed12d
Fix wrong conditional, Snow decoding, not encoding, was SIMD-accelerated.
diego
parents:
4436
diff
changeset
|
3171 #ifdef CONFIG_SNOW_DECODER |
/* Prototypes of the Snow SIMD routines (one SSE2 and one MMX variant each of
 * horizontal compose, vertical compose and inner_add_yblock). They are only
 * declared when CONFIG_SNOW_DECODER is defined; the definitions are not part
 * of this section (presumably a separate Snow SIMD source file - confirm). */
3210 | 3172 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width); |
3173 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width); | |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
3174
diff
changeset
|
3174 extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width); |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
3174
diff
changeset
|
3175 extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width); |
4436
d3e389536b0a
Add the const specifier as needed to reduce the number of warnings.
takis
parents:
4197
diff
changeset
|
3176 extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
3177 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); |
4436
d3e389536b0a
Add the const specifier as needed to reduce the number of warnings.
takis
parents:
4197
diff
changeset
|
3178 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
3179 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); |
3210 | 3180 #endif |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
3174
diff
changeset
|
3181 |
1092 | 3182 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
0 | 3183 { |
4197 | 3184 mm_flags = mm_support(); |
1115 | 3185 |
1122 | 3186 if (avctx->dsp_mask) { |
2979 | 3187 if (avctx->dsp_mask & FF_MM_FORCE) |
4197 | 3188 mm_flags |= (avctx->dsp_mask & 0xffff); |
2979 | 3189 else |
4197 | 3190 mm_flags &= ~(avctx->dsp_mask & 0xffff); |
1122 | 3191 } |
1115 | 3192 |
631
47a8964ba5cd
be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents:
629
diff
changeset
|
3193 #if 0 |
1868 | 3194 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:"); |
4197 | 3195 if (mm_flags & MM_MMX) |
1868 | 3196 av_log(avctx, AV_LOG_INFO, " mmx"); |
4197 | 3197 if (mm_flags & MM_MMXEXT) |
1868 | 3198 av_log(avctx, AV_LOG_INFO, " mmxext"); |
4197 | 3199 if (mm_flags & MM_3DNOW) |
1868 | 3200 av_log(avctx, AV_LOG_INFO, " 3dnow"); |
4197 | 3201 if (mm_flags & MM_SSE) |
1868 | 3202 av_log(avctx, AV_LOG_INFO, " sse"); |
4197 | 3203 if (mm_flags & MM_SSE2) |
1868 | 3204 av_log(avctx, AV_LOG_INFO, " sse2"); |
3205 av_log(avctx, AV_LOG_INFO, "\n"); | |
0 | 3206 #endif |
3207 | |
4197 | 3208 if (mm_flags & MM_MMX) { |
1092 | 3209 const int idct_algo= avctx->idct_algo; |
3210 | |
1232
e88d3b1fb2a1
more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>)
michaelni
parents:
1186
diff
changeset
|
3211 #ifdef CONFIG_ENCODERS |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1985
diff
changeset
|
3212 const int dct_algo = avctx->dct_algo; |
1565 | 3213 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ |
4197 | 3214 if(mm_flags & MM_SSE2){ |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
3215 c->fdct = ff_fdct_sse2; |
4197 | 3216 }else if(mm_flags & MM_MMXEXT){ |
1565 | 3217 c->fdct = ff_fdct_mmx2; |
3218 }else{ | |
3219 c->fdct = ff_fdct_mmx; | |
3220 } | |
3221 } | |
1232
e88d3b1fb2a1
more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>)
michaelni
parents:
1186
diff
changeset
|
3222 #endif //CONFIG_ENCODERS |
2256 | 3223 if(avctx->lowres==0){ |
3224 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ | |
3225 c->idct_put= ff_simple_idct_put_mmx; | |
3226 c->idct_add= ff_simple_idct_add_mmx; | |
3227 c->idct = ff_simple_idct_mmx; | |
3228 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; | |
3717
ea9fe1c9d126
Remove the LGPL exception clause as discussed on ffmpeg-devel
diego
parents:
3712
diff
changeset
|
3229 #ifdef CONFIG_GPL |
2256 | 3230 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ |
4197 | 3231 if(mm_flags & MM_MMXEXT){ |
2256 | 3232 c->idct_put= ff_libmpeg2mmx2_idct_put; |
3233 c->idct_add= ff_libmpeg2mmx2_idct_add; | |
3234 c->idct = ff_mmxext_idct; | |
3235 }else{ | |
3236 c->idct_put= ff_libmpeg2mmx_idct_put; | |
3237 c->idct_add= ff_libmpeg2mmx_idct_add; | |
3238 c->idct = ff_mmx_idct; | |
3239 } | |
3240 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; | |
3717
ea9fe1c9d126
Remove the LGPL exception clause as discussed on ffmpeg-devel
diego
parents:
3712
diff
changeset
|
3241 #endif |
5007 | 3242 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) && |
3243 idct_algo==FF_IDCT_VP3 && | |
3721
2000e401593d
disable vp3 mmx idct for theora files to avoid artifacts
aurel
parents:
3717
diff
changeset
|
3244 avctx->codec->id!=CODEC_ID_THEORA && |
3712
f7f75f718efb
Enables back the mmx/sse optimized version of the vp3 idct.
aurel
parents:
3666
diff
changeset
|
3245 !(avctx->flags & CODEC_FLAG_BITEXACT)){ |
4197 | 3246 if(mm_flags & MM_SSE2){ |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
3247 c->idct_put= ff_vp3_idct_put_sse2; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
3248 c->idct_add= ff_vp3_idct_add_sse2; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
3249 c->idct = ff_vp3_idct_sse2; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
3250 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
3251 }else{ |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
3252 ff_vp3_dsp_init_mmx(); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
3253 c->idct_put= ff_vp3_idct_put_mmx; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
3254 c->idct_add= ff_vp3_idct_add_mmx; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
3255 c->idct = ff_vp3_idct_mmx; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
3256 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
3257 } |
3524 | 3258 }else if(idct_algo==FF_IDCT_CAVS){ |
3259 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; | |
2868 | 3260 }else if(idct_algo==FF_IDCT_XVIDMMX){ |
4197 | 3261 if(mm_flags & MM_MMXEXT){ |
2868 | 3262 c->idct_put= ff_idct_xvid_mmx2_put; |
3263 c->idct_add= ff_idct_xvid_mmx2_add; | |
3264 c->idct = ff_idct_xvid_mmx2; | |
3265 }else{ | |
3266 c->idct_put= ff_idct_xvid_mmx_put; | |
3267 c->idct_add= ff_idct_xvid_mmx_add; | |
3268 c->idct = ff_idct_xvid_mmx; | |
3269 } | |
1092 | 3270 } |
3271 } | |
1868 | 3272 |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
3273 #ifdef CONFIG_ENCODERS |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3274 c->get_pixels = get_pixels_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3275 c->diff_pixels = diff_pixels_mmx; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
3276 #endif //CONFIG_ENCODERS |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3277 c->put_pixels_clamped = put_pixels_clamped_mmx; |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
3278 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx; |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3279 c->add_pixels_clamped = add_pixels_clamped_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3280 c->clear_blocks = clear_blocks_mmx; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
3281 #ifdef CONFIG_ENCODERS |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3282 c->pix_sum = pix_sum16_mmx; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
3283 #endif //CONFIG_ENCODERS |
415 | 3284 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3285 c->put_pixels_tab[0][0] = put_pixels16_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3286 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3287 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3288 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx; |
0 | 3289 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3290 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3291 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3292 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3293 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx; |
651 | 3294 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3295 c->avg_pixels_tab[0][0] = avg_pixels16_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3296 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3297 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3298 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx; |
415 | 3299 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3300 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3301 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3302 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3303 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3304 |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3305 c->put_pixels_tab[1][0] = put_pixels8_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3306 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3307 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3308 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx; |
0 | 3309 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3310 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3311 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3312 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3313 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx; |
651 | 3314 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3315 c->avg_pixels_tab[1][0] = avg_pixels8_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3316 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3317 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3318 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx; |
651 | 3319 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3320 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3321 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3322 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3323 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx; |
2967 | 3324 |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
3325 c->gmc= gmc_mmx; |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3215
diff
changeset
|
3326 |
866 | 3327 c->add_bytes= add_bytes_mmx; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
3328 #ifdef CONFIG_ENCODERS |
866 | 3329 c->diff_bytes= diff_bytes_mmx; |
4988
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
3330 c->sum_abs_dctelem= sum_abs_dctelem_mmx; |
2967 | 3331 |
936 | 3332 c->hadamard8_diff[0]= hadamard8_diff16_mmx; |
3333 c->hadamard8_diff[1]= hadamard8_diff_mmx; | |
2967 | 3334 |
2979 | 3335 c->pix_norm1 = pix_norm1_mmx; |
4197 | 3336 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx; |
2979 | 3337 c->sse[1] = sse8_mmx; |
1729 | 3338 c->vsad[4]= vsad_intra16_mmx; |
3339 | |
2979 | 3340 c->nsse[0] = nsse16_mmx; |
3341 c->nsse[1] = nsse8_mmx; | |
1729 | 3342 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
3343 c->vsad[0] = vsad16_mmx; | |
3344 } | |
2967 | 3345 |
1784 | 3346 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
3347 c->try_8x8basis= try_8x8basis_mmx; | |
3348 } | |
3349 c->add_8x8basis= add_8x8basis_mmx; | |
2967 | 3350 |
4749 | 3351 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx; |
3352 | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
3353 #endif //CONFIG_ENCODERS |
1647 | 3354 |
5277
7b3fcb7c61ce
Avoid linking with h263.c functions when the relevant codecs
aurel
parents:
5255
diff
changeset
|
3355 if (ENABLE_ANY_H263) { |
5278 | 3356 c->h263_v_loop_filter= h263_v_loop_filter_mmx; |
3357 c->h263_h_loop_filter= h263_h_loop_filter_mmx; | |
5277
7b3fcb7c61ce
Avoid linking with h263.c functions when the relevant codecs
aurel
parents:
5255
diff
changeset
|
3358 } |
2979 | 3359 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx; |
2922
d772011258ec
faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents:
2902
diff
changeset
|
3360 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx; |
2967 | 3361 |
3173 | 3362 c->h264_idct_dc_add= |
3363 c->h264_idct_add= ff_h264_idct_add_mmx; | |
3174 | 3364 c->h264_idct8_dc_add= |
3365 c->h264_idct8_add= ff_h264_idct8_add_mmx; | |
3173 | 3366 |
4197 | 3367 if (mm_flags & MM_MMXEXT) { |
3215
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3213
diff
changeset
|
3368 c->prefetch = prefetch_mmx2; |
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3213
diff
changeset
|
3369 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3370 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3371 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; |
651 | 3372 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3373 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3374 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3375 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; |
415 | 3376 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3377 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3378 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2; |
651 | 3379 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3380 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3381 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3382 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; |
1092 | 3383 |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
3384 #ifdef CONFIG_ENCODERS |
4988
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
3385 c->sum_abs_dctelem= sum_abs_dctelem_mmx2; |
1153 | 3386 c->hadamard8_diff[0]= hadamard8_diff16_mmx2; |
3387 c->hadamard8_diff[1]= hadamard8_diff_mmx2; | |
1729 | 3388 c->vsad[4]= vsad_intra16_mmx2; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
3389 #endif //CONFIG_ENCODERS |
1153 | 3390 |
3105
2d35fb3cb940
h264: special case dc-only idct. ~1% faster overall
lorenm
parents:
3089
diff
changeset
|
3391 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; |
2d35fb3cb940
h264: special case dc-only idct. ~1% faster overall
lorenm
parents:
3089
diff
changeset
|
3392 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; |
2745 | 3393 |
1092 | 3394 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
3395 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; | |
3396 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; | |
3397 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; | |
3398 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; | |
3399 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; | |
3400 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; | |
1772
8cd5257195c9
vsad16_mmx2 only applies if encoders are turned on
melanson
parents:
1765
diff
changeset
|
3401 #ifdef CONFIG_ENCODERS |
1729 | 3402 c->vsad[0] = vsad16_mmx2; |
1772
8cd5257195c9
vsad16_mmx2 only applies if encoders are turned on
melanson
parents:
1765
diff
changeset
|
3403 #endif //CONFIG_ENCODERS |
1092 | 3404 } |
959 | 3405 |
961 | 3406 #if 1 |
954 | 3407 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2) |
3408 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2) | |
3409 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2) | |
3410 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2) | |
3411 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2) | |
3412 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2) | |
3413 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2) | |
3414 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2) | |
3415 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2) | |
3416 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2) | |
3417 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2) | |
3418 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2) | |
3419 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2) | |
3420 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2) | |
3421 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2) | |
3422 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2) | |
3423 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2) | |
3424 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2) | |
3425 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2) | |
3426 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2) | |
3427 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2) | |
3428 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2) | |
3429 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2) | |
3430 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2) | |
3431 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2) | |
3432 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2) | |
3433 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2) | |
3434 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2) | |
3435 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2) | |
3436 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2) | |
3437 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2) | |
3438 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2) | |
961 | 3439 #endif |
1527 | 3440 |
2209 | 3441 //FIXME 3dnow too |
3442 #define dspfunc(PFX, IDX, NUM) \ | |
3443 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \ | |
3444 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \ | |
3445 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \ | |
3446 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \ | |
3447 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \ | |
3448 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \ | |
3449 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \ | |
3450 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \ | |
3451 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \ | |
3452 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \ | |
3453 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \ | |
3454 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \ | |
3455 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \ | |
3456 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \ | |
3457 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \ | |
3458 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2 | |
3459 | |
3460 dspfunc(put_h264_qpel, 0, 16); | |
3461 dspfunc(put_h264_qpel, 1, 8); | |
3462 dspfunc(put_h264_qpel, 2, 4); | |
3463 dspfunc(avg_h264_qpel, 0, 16); | |
3464 dspfunc(avg_h264_qpel, 1, 8); | |
3465 dspfunc(avg_h264_qpel, 2, 4); | |
3807
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
3466 |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
3467 dspfunc(put_2tap_qpel, 0, 16); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
3468 dspfunc(put_2tap_qpel, 1, 8); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
3469 dspfunc(avg_2tap_qpel, 0, 16); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
3470 dspfunc(avg_2tap_qpel, 1, 8); |
2209 | 3471 #undef dspfunc |
3472 | |
2979 | 3473 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2; |
2922
d772011258ec
faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents:
2902
diff
changeset
|
3474 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2; |
3213 | 3475 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2; |
3476 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2; | |
2633 | 3477 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; |
3478 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; | |
3479 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; | |
3480 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3481 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3482 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; |
3645
47821be55b6c
mmx implementation of deblocking strength decision.
lorenm
parents:
3576
diff
changeset
|
3483 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; |
2633 | 3484 |
2902
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3485 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3486 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3487 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3488 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3489 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3490 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3491 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3492 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3493 |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3494 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3495 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3496 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3497 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3498 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3499 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3500 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3501 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; |
3c79bc9f3aa9
h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents:
2899
diff
changeset
|
3502 |
3524 | 3503 #ifdef CONFIG_CAVS_DECODER |
3504 ff_cavsdsp_init_mmx2(c, avctx); | |
3505 #endif | |
3506 | |
1686
68abbec33289
Here are just two added #ifdef CONFIG_ENCODERS to allow
michael
parents:
1648
diff
changeset
|
3507 #ifdef CONFIG_ENCODERS |
1527 | 3508 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; |
1686
68abbec33289
Here are just two added #ifdef CONFIG_ENCODERS to allow
michael
parents:
1648
diff
changeset
|
3509 #endif //CONFIG_ENCODERS |
4197 | 3510 } else if (mm_flags & MM_3DNOW) { |
3215
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3213
diff
changeset
|
3511 c->prefetch = prefetch_3dnow; |
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3213
diff
changeset
|
3512 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3513 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3514 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; |
393 | 3515 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3516 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3517 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3518 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; |
651 | 3519 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3520 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3521 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3522 |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3523 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3524 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
3525 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; |
1092 | 3526 |
3527 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
3528 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; | |
3529 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; | |
3530 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; | |
3531 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; | |
3532 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; | |
3533 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; | |
3534 } | |
984 | 3535 |
954 | 3536 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow) |
3537 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow) | |
3538 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow) | |
3539 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow) | |
3540 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow) | |
3541 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow) | |
3542 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow) | |
3543 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow) | |
3544 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow) | |
3545 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow) | |
3546 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow) | |
3547 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow) | |
3548 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow) | |
3549 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow) | |
3550 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow) | |
3551 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow) | |
3552 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow) | |
3553 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow) | |
3554 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow) | |
3555 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow) | |
3556 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow) | |
3557 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow) | |
3558 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow) | |
3559 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow) | |
3560 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow) | |
3561 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow) | |
3562 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow) | |
3563 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow) | |
3564 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow) | |
3565 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow) | |
3566 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow) | |
3567 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow) | |
2209 | 3568 |
3569 #define dspfunc(PFX, IDX, NUM) \ | |
3570 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \ | |
3571 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \ | |
3572 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \ | |
3573 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \ | |
3574 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \ | |
3575 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \ | |
3576 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \ | |
3577 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \ | |
3578 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \ | |
3579 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \ | |
3580 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \ | |
3581 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \ | |
3582 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \ | |
3583 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \ | |
3584 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \ | |
3585 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow | |
3586 | |
3587 dspfunc(put_h264_qpel, 0, 16); | |
3588 dspfunc(put_h264_qpel, 1, 8); | |
3589 dspfunc(put_h264_qpel, 2, 4); | |
3590 dspfunc(avg_h264_qpel, 0, 16); | |
3591 dspfunc(avg_h264_qpel, 1, 8); | |
3592 dspfunc(avg_h264_qpel, 2, 4); | |
2732 | 3593 |
3807
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
3594 dspfunc(put_2tap_qpel, 0, 16); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
3595 dspfunc(put_2tap_qpel, 1, 8); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
3596 dspfunc(avg_2tap_qpel, 0, 16); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
3597 dspfunc(avg_2tap_qpel, 1, 8); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3777
diff
changeset
|
3598 |
2979 | 3599 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow; |
2922
d772011258ec
faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents:
2902
diff
changeset
|
3600 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow; |
0 | 3601 } |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
3174
diff
changeset
|
3602 |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
3603 #ifdef CONFIG_ENCODERS |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
3604 if(mm_flags & MM_SSE2){ |
4988
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
3605 c->sum_abs_dctelem= sum_abs_dctelem_sse2; |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
3606 c->hadamard8_diff[0]= hadamard8_diff16_sse2; |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
3607 c->hadamard8_diff[1]= hadamard8_diff_sse2; |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
3608 } |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
3609 |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
3610 #ifdef HAVE_SSSE3 |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
3611 if(mm_flags & MM_SSSE3){ |
5024
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
5014
diff
changeset
|
3612 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
5014
diff
changeset
|
3613 c->try_8x8basis= try_8x8basis_ssse3; |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
5014
diff
changeset
|
3614 } |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
5014
diff
changeset
|
3615 c->add_8x8basis= add_8x8basis_ssse3; |
4988
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4987
diff
changeset
|
3616 c->sum_abs_dctelem= sum_abs_dctelem_ssse3; |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
3617 c->hadamard8_diff[0]= hadamard8_diff16_ssse3; |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
3618 c->hadamard8_diff[1]= hadamard8_diff_ssse3; |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
3619 } |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
3620 #endif |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
3621 #endif |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
3622 |
4589
30261f4ed12d
Fix wrong conditional, Snow decoding, not encoding, was SIMD-accelerated.
diego
parents:
4436
diff
changeset
|
3623 #ifdef CONFIG_SNOW_DECODER |
5591 | 3624 if(mm_flags & MM_SSE2 & 0){ |
3210 | 3625 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
3174
diff
changeset
|
3626 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
3627 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
3174
diff
changeset
|
3628 } |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
3174
diff
changeset
|
3629 else{ |
3210 | 3630 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
3174
diff
changeset
|
3631 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
3632 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
3174
diff
changeset
|
3633 } |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
3174
diff
changeset
|
3634 #endif |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
3635 |
4197 | 3636 if(mm_flags & MM_3DNOW){ |
5024
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
5014
diff
changeset
|
3637 #ifdef CONFIG_ENCODERS |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
5014
diff
changeset
|
3638 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
5014
diff
changeset
|
3639 c->try_8x8basis= try_8x8basis_3dnow; |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
5014
diff
changeset
|
3640 } |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
5014
diff
changeset
|
3641 c->add_8x8basis= add_8x8basis_3dnow; |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
5014
diff
changeset
|
3642 #endif //CONFIG_ENCODERS |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3643 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3644 c->vector_fmul = vector_fmul_3dnow; |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3645 if(!(avctx->flags & CODEC_FLAG_BITEXACT)) |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3646 c->float_to_int16 = float_to_int16_3dnow; |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3647 } |
4197 | 3648 if(mm_flags & MM_3DNOWEXT) |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3649 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; |
4197 | 3650 if(mm_flags & MM_SSE){ |
3557
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
3651 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3652 c->vector_fmul = vector_fmul_sse; |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3653 c->float_to_int16 = float_to_int16_sse; |
3569
c42c03f3b402
convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents:
3568
diff
changeset
|
3654 c->vector_fmul_reverse = vector_fmul_reverse_sse; |
c42c03f3b402
convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents:
3568
diff
changeset
|
3655 c->vector_fmul_add_add = vector_fmul_add_add_sse; |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3561
diff
changeset
|
3656 } |
4197 | 3657 if(mm_flags & MM_3DNOW) |
3574 | 3658 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse |
0 | 3659 } |
2967 | 3660 |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
3661 #ifdef CONFIG_ENCODERS |
1092 | 3662 dsputil_init_pix_mmx(c, avctx); |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
3663 #endif //CONFIG_ENCODERS |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3664 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3665 // for speed testing |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3666 get_pixels = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3667 put_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3668 add_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3669 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3670 pix_abs16x16 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3671 pix_abs16x16_x2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3672 pix_abs16x16_y2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3673 pix_abs16x16_xy2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3674 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3675 put_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3676 put_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3677 put_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3678 put_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3679 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3680 put_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3681 put_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3682 put_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3683 put_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3684 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3685 avg_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3686 avg_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3687 avg_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3688 avg_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3689 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3690 avg_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3691 avg_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3692 avg_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3693 avg_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3694 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3695 //av_fdct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3696 //ff_idct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
3697 #endif |
0 | 3698 } |