annotate i386/dsputil_mmx.c @ 2918:13dcd22f0816 (libavcodec)

changeset: Add DTS_INC to the CFLAGS for DTS. This is only set by external
           configure programs (such as the MPlayer one) and thus somewhat
           hackish. We already do this for things like MLIB_INC and IPP_INC,
           so it should be acceptable.
author:    diego
date:      Sun, 23 Oct 2005 18:16:53 +0000
parents:   3c79bc9f3aa9
children:  d772011258ec

/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "../dsputil.h"
#include "../simple_idct.h"
#include "../mpegvideo.h"
#include "mmx.h"

//#undef NDEBUG
//#include <assert.h>

extern const uint8_t ff_h263_loop_filter_strength[32];
extern void ff_idct_xvid_mmx(short *block);
extern void ff_idct_xvid_mmx2(short *block);

int mm_flags; /* multimedia extension flags */

/* pixel operations */
static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
static const uint64_t ff_pw_3  attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
static const uint64_t ff_pw_4  attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
static const uint64_t ff_pw_5  attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;

static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;

#define JUMPALIGN() __asm __volatile (".balign 8"::)
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
#else
// for shared libraries it's better to access constants this way
// pcmpeqd -> -1
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

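/* NOTE (added, a sketch of the idea behind the four macros above): they
 * implement the carry-free byte-averaging identities from
 * http://aggregate.org/MAGIC, referenced in this file's commit history.
 * For unsigned bytes a and b:
 *     rounded:   (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xFE) >> 1)
 *     truncated: (a + b)     >> 1 == (a & b) + (((a ^ b) & 0xFE) >> 1)
 * MMX has no packed 8-bit shift, so the pand with regfe/mm6 (0xfe..fe)
 * clears each byte's low bit before the 64-bit psrlq, keeping bits from
 * leaking into the neighbouring byte.  Scalar equivalent for one byte:
 *     uint8_t avg_rnd   = (a | b) - (((a ^ b) & 0xFE) >> 1);
 *     uint8_t avg_trunc = (a & b) + (((a ^ b) & 0xFE) >> 1);
 */
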
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

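/* NOTE (added): dsputil_mmx_rnd.h serves as a template header; it is
 * included twice and instantiates the same pixel loops with whichever
 * DEF name pattern, SET_RND and PAVGB/PAVGBP flavor are currently
 * defined.  As an illustration of the mechanics only (the actual
 * instantiations live in that header): DEF(avg, pixels8) would expand to
 * avg_no_rnd_pixels8_mmx on the first pass and avg_pixels8_mmx on the
 * second. */
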
/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
/* for Athlons PAVGUSB is preferred */
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB
188 /***********************************/ | |
189 /* standard MMX */ | |
190 | |
#ifdef CONFIG_ENCODERS
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}
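
/* NOTE (added): the loop above widens an 8x8 block of unsigned pixels
 * into 16-bit DCT coefficients, two rows per iteration; punpcklbw and
 * punpckhbw against the zeroed mm7 zero-extend the low/high four bytes.
 * REG_a runs from -128 up to 0 against the base block+64, so the loop
 * ends on a plain sign test (js).  A scalar sketch of the same transform:
 *     for (i = 0; i < 8; i++)
 *         for (j = 0; j < 8; j++)
 *             block[8*i + j] = pixels[i*line_size + j];
 */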

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ".balign 16                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}
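
/* NOTE (added): same widening trick as get_pixels_mmx, but storing the
 * 16-bit difference of two source blocks, roughly
 *     block[8*i + j] = s1[i*stride + j] - s2[i*stride + j];
 * in scalar form.  psubw works on the zero-extended words, so the result
 * keeps full precision and is not clamped. */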
#endif //CONFIG_ENCODERS

void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    __asm __volatile(
        "movq %3, %%mm0                 \n\t"
        "movq 8%3, %%mm1                \n\t"
        "movq 16%3, %%mm2               \n\t"
        "movq 24%3, %%mm3               \n\t"
        "movq 32%3, %%mm4               \n\t"
        "movq 40%3, %%mm5               \n\t"
        "movq 48%3, %%mm6               \n\t"
        "movq 56%3, %%mm7               \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq %%mm0, (%0)               \n\t"
        "movq %%mm2, (%0, %1)           \n\t"
        "movq %%mm4, (%0, %1, 2)        \n\t"
        "movq %%mm6, (%0, %2)           \n\t"
        ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    // if this were an exact copy of the code above, the compiler
    // would generate some very strange code, thus using "r"
    __asm __volatile(
        "movq (%3), %%mm0               \n\t"
        "movq 8(%3), %%mm1              \n\t"
        "movq 16(%3), %%mm2             \n\t"
        "movq 24(%3), %%mm3             \n\t"
        "movq 32(%3), %%mm4             \n\t"
        "movq 40(%3), %%mm5             \n\t"
        "movq 48(%3), %%mm6             \n\t"
        "movq 56(%3), %%mm7             \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq %%mm0, (%0)               \n\t"
        "movq %%mm2, (%0, %1)           \n\t"
        "movq %%mm4, (%0, %1, 2)        \n\t"
        "movq %%mm6, (%0, %2)           \n\t"
        ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
        :"memory");
}
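
/* NOTE (added): the clamping is done entirely by packuswb, which packs
 * signed 16-bit values to bytes with unsigned saturation, i.e. every DCT
 * output lands in [0,255] with no explicit compare.  The "m"(*p) operand
 * lets the first copy address the block with constant offsets (movq 8%3,
 * ...); the second copy must use "r"(p) because of the compiler quirk
 * described in the comment above. */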

static const unsigned char __align8 vector128[8] =
    { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}

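/* NOTE (added): packsswb saturates the signed coefficients to [-128,127]
 * and the paddb with vector128 then biases them into [0,255]; byte
 * addition wraps, so the net effect is clamp(block[i] + 128, 0, 255)
 * without any branch. */
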
void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
            "movq (%2), %%mm0           \n\t"
            "movq 8(%2), %%mm1          \n\t"
            "movq 16(%2), %%mm2         \n\t"
            "movq 24(%2), %%mm3         \n\t"
            "movq %0, %%mm4             \n\t"
            "movq %1, %%mm6             \n\t"
            "movq %%mm4, %%mm5          \n\t"
            "punpcklbw %%mm7, %%mm4     \n\t"
            "punpckhbw %%mm7, %%mm5     \n\t"
            "paddsw %%mm4, %%mm0        \n\t"
            "paddsw %%mm5, %%mm1        \n\t"
            "movq %%mm6, %%mm5          \n\t"
            "punpcklbw %%mm7, %%mm6     \n\t"
            "punpckhbw %%mm7, %%mm5     \n\t"
            "paddsw %%mm6, %%mm2        \n\t"
            "paddsw %%mm5, %%mm3        \n\t"
            "packuswb %%mm1, %%mm0      \n\t"
            "packuswb %%mm3, %%mm2      \n\t"
            "movq %%mm0, %0             \n\t"
            "movq %%mm2, %1             \n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

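/* NOTE (added): per pixel this is pix[x] = clip(pix[x] + block[x], 0,
 * 255): paddsw adds the widened pixels to the coefficients with signed
 * saturation, and packuswb clamps the sums back to unsigned bytes on the
 * way out. */
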
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        ".balign 8                      \n\t"
        "1:                             \n\t"
        "movd (%1), %%mm0               \n\t"
        "movd (%1, %3), %%mm1           \n\t"
        "movd %%mm0, (%2)               \n\t"
        "movd %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movd (%1), %%mm0               \n\t"
        "movd (%1, %3), %%mm1           \n\t"
        "movd %%mm0, (%2)               \n\t"
        "movd %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        ".balign 8                      \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        ".balign 8                      \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq 8(%1), %%mm4              \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm4, 8(%2)              \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "movq %%mm5, 8(%2, %3)          \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq 8(%1), %%mm4              \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm4, 8(%2)              \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "movq %%mm5, 8(%2, %3)          \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}

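/* NOTE (added): put_pixels4/8/16_mmx are straight copies of h rows of 4,
 * 8 or 16 bytes.  Each loop is unrolled four rows deep: REG_a holds
 * 2*line_size so one add advances two rows, and "subl $4, %0" assumes h
 * is a multiple of 4 (which appears to hold for the block heights these
 * serve). */
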
static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128*6, %%"REG_a"         \n\t"
        "1:                             \n\t"
        "movq %%mm7, (%0, %%"REG_a")    \n\t"
        "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
        "add $32, %%"REG_a"             \n\t"
        " js 1b                         \n\t"
        : : "r" (((uint8_t *)blocks)+128*6)
        : "%"REG_a
    );
}

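/* NOTE (added): clears six 64-coefficient blocks (6*128 bytes with
 * 16-bit DCTELEMs) 32 bytes per iteration, once more counting a negative
 * offset up from the end of the buffer so the sign flag ends the loop. */
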
#ifdef CONFIG_ENCODERS
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm __volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "pxor %%mm6, %%mm6              \n\t"
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t"
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t"
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t"
        " js 1b                         \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((long)line_size)
    );

    return sum;
}
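
/* NOTE (added): sums all 256 pixels of a 16x16 block, one row per
 * iteration (the paired movq loads fetch the same 8 bytes twice so
 * punpcklbw/punpckhbw can split them into word lanes).  The tail folds
 * the four word lanes of mm6 with two psrlq/paddw steps; the final andl
 * is safe because 256*255 = 65280 still fits in 16 bits. */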
#endif //CONFIG_ENCODERS

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq (%1, %0), %%mm0           \n\t"
        "movq (%2, %0), %%mm1           \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

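/* NOTE (added): dst[i] += src[i] over w bytes, 16 per iteration with
 * wrapping byte adds (paddb).  The cmp against w-15 stops the vector
 * loop once fewer than 16 bytes remain, and the scalar for loop above
 * finishes the tail. */
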
#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7              \n\t"\
    "movq %0, %%mm0                 \n\t"\
    "movq %0, %%mm1                 \n\t"\
    "movq %3, %%mm2                 \n\t"\
    "movq %3, %%mm3                 \n\t"\
    "punpcklbw %%mm7, %%mm0         \n\t"\
    "punpckhbw %%mm7, %%mm1         \n\t"\
    "punpcklbw %%mm7, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm3         \n\t"\
    "psubw %%mm2, %%mm0             \n\t"\
    "psubw %%mm3, %%mm1             \n\t"\
    "movq %1, %%mm2                 \n\t"\
    "movq %1, %%mm3                 \n\t"\
    "movq %2, %%mm4                 \n\t"\
    "movq %2, %%mm5                 \n\t"\
    "punpcklbw %%mm7, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm3         \n\t"\
    "punpcklbw %%mm7, %%mm4         \n\t"\
    "punpckhbw %%mm7, %%mm5         \n\t"\
    "psubw %%mm2, %%mm4             \n\t"\
    "psubw %%mm3, %%mm5             \n\t"\
    "psllw $2, %%mm4                \n\t"\
    "psllw $2, %%mm5                \n\t"\
    "paddw %%mm0, %%mm4             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "pxor %%mm6, %%mm6              \n\t"\
    "pcmpgtw %%mm4, %%mm6           \n\t"\
    "pcmpgtw %%mm5, %%mm7           \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "pxor %%mm7, %%mm5              \n\t"\
    "psubw %%mm6, %%mm4             \n\t"\
    "psubw %%mm7, %%mm5             \n\t"\
    "psrlw $3, %%mm4                \n\t"\
    "psrlw $3, %%mm5                \n\t"\
    "packuswb %%mm5, %%mm4          \n\t"\
    "packsswb %%mm7, %%mm6          \n\t"\
    "pxor %%mm7, %%mm7              \n\t"\
    "movd %4, %%mm2                 \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "psubusb %%mm4, %%mm2           \n\t"\
    "movq %%mm2, %%mm3              \n\t"\
    "psubusb %%mm4, %%mm3           \n\t"\
    "psubb %%mm3, %%mm2             \n\t"\
    "movq %1, %%mm3                 \n\t"\
    "movq %2, %%mm4                 \n\t"\
    "pxor %%mm6, %%mm3              \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "paddusb %%mm2, %%mm3           \n\t"\
    "psubusb %%mm2, %%mm4           \n\t"\
    "pxor %%mm6, %%mm3              \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "paddusb %%mm2, %%mm2           \n\t"\
    "packsswb %%mm1, %%mm0          \n\t"\
    "pcmpgtb %%mm0, %%mm7           \n\t"\
    "pxor %%mm7, %%mm0              \n\t"\
    "psubb %%mm7, %%mm0             \n\t"\
    "movq %%mm0, %%mm1              \n\t"\
    "psubusb %%mm2, %%mm0           \n\t"\
    "psubb %%mm0, %%mm1             \n\t"\
    "pand %5, %%mm1                 \n\t"\
    "psrlw $2, %%mm1                \n\t"\
    "pxor %%mm7, %%mm1              \n\t"\
    "psubb %%mm7, %%mm1             \n\t"\
    "movq %0, %%mm5                 \n\t"\
    "movq %3, %%mm6                 \n\t"\
    "psubb %%mm1, %%mm5             \n\t"\
    "paddb %%mm1, %%mm6             \n\t"

601 const int strength= ff_h263_loop_filter_strength[qscale]; | |
602 | |
603 asm volatile( | |
1648 | 604 |
605 H263_LOOP_FILTER | |
606 | |
1647 | 607 "movq %%mm3, %1 \n\t" |
608 "movq %%mm4, %2 \n\t" | |
1648 | 609 "movq %%mm5, %0 \n\t" |
610 "movq %%mm6, %3 \n\t" | |
1647 | 611 : "+m" (*(uint64_t*)(src - 2*stride)), |
612 "+m" (*(uint64_t*)(src - 1*stride)), | |
613 "+m" (*(uint64_t*)(src + 0*stride)), | |
614 "+m" (*(uint64_t*)(src + 1*stride)) | |
615 : "g" (2*strength), "m"(ff_pb_FC) | |
616 ); | |
617 } | |
618 | |
1648 | 619 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ |
620 asm volatile( //FIXME could save 1 instruction if done as 8x4 ... | |
621 "movd %4, %%mm0 \n\t" | |
622 "movd %5, %%mm1 \n\t" | |
623 "movd %6, %%mm2 \n\t" | |
624 "movd %7, %%mm3 \n\t" | |
625 "punpcklbw %%mm1, %%mm0 \n\t" | |
626 "punpcklbw %%mm3, %%mm2 \n\t" | |
627 "movq %%mm0, %%mm1 \n\t" | |
628 "punpcklwd %%mm2, %%mm0 \n\t" | |
629 "punpckhwd %%mm2, %%mm1 \n\t" | |
630 "movd %%mm0, %0 \n\t" | |
631 "punpckhdq %%mm0, %%mm0 \n\t" | |
632 "movd %%mm0, %1 \n\t" | |
633 "movd %%mm1, %2 \n\t" | |
634 "punpckhdq %%mm1, %%mm1 \n\t" | |
635 "movd %%mm1, %3 \n\t" | |
636 | |
637 : "=m" (*(uint32_t*)(dst + 0*dst_stride)), | |
638 "=m" (*(uint32_t*)(dst + 1*dst_stride)), | |
639 "=m" (*(uint32_t*)(dst + 2*dst_stride)), | |
640 "=m" (*(uint32_t*)(dst + 3*dst_stride)) | |
641 : "m" (*(uint32_t*)(src + 0*src_stride)), | |
642 "m" (*(uint32_t*)(src + 1*src_stride)), | |
643 "m" (*(uint32_t*)(src + 2*src_stride)), | |
644 "m" (*(uint32_t*)(src + 3*src_stride)) | |
645 ); | |
646 } | |
647 | |
648 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ | |
649 const int strength= ff_h263_loop_filter_strength[qscale]; | |
650 uint64_t temp[4] __attribute__ ((aligned(8))); | |
651 uint8_t *btemp= (uint8_t*)temp; | |
652 | |
653 src -= 2; | |
654 | |
655 transpose4x4(btemp , src , 8, stride); | |
656 transpose4x4(btemp+4, src + 4*stride, 8, stride); | |
657 asm volatile( | |
658 H263_LOOP_FILTER // 5 3 4 6 | |
659 | |
660 : "+m" (temp[0]), | |
661 "+m" (temp[1]), | |
662 "+m" (temp[2]), | |
663 "+m" (temp[3]) | |
664 : "g" (2*strength), "m"(ff_pb_FC) | |
665 ); | |
666 | |
667 asm volatile( | |
668 "movq %%mm5, %%mm1 \n\t" | |
669 "movq %%mm4, %%mm0 \n\t" | |
670 "punpcklbw %%mm3, %%mm5 \n\t" | |
671 "punpcklbw %%mm6, %%mm4 \n\t" | |
672 "punpckhbw %%mm3, %%mm1 \n\t" | |
673 "punpckhbw %%mm6, %%mm0 \n\t" | |
674 "movq %%mm5, %%mm3 \n\t" | |
675 "movq %%mm1, %%mm6 \n\t" | |
676 "punpcklwd %%mm4, %%mm5 \n\t" | |
677 "punpcklwd %%mm0, %%mm1 \n\t" | |
678 "punpckhwd %%mm4, %%mm3 \n\t" | |
679 "punpckhwd %%mm0, %%mm6 \n\t" | |
2505
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
680 "movd %%mm5, (%0) \n\t" |
1648 | 681 "punpckhdq %%mm5, %%mm5 \n\t" |
2505
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
682 "movd %%mm5, (%0,%2) \n\t" |
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
683 "movd %%mm3, (%0,%2,2) \n\t" |
1648 | 684 "punpckhdq %%mm3, %%mm3 \n\t" |
2505
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
685 "movd %%mm3, (%0,%3) \n\t" |
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
686 "movd %%mm1, (%1) \n\t" |
1648 | 687 "punpckhdq %%mm1, %%mm1 \n\t" |
2505
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
688 "movd %%mm1, (%1,%2) \n\t" |
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
689 "movd %%mm6, (%1,%2,2) \n\t" |
1648 | 690 "punpckhdq %%mm6, %%mm6 \n\t" |
2505
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
691 "movd %%mm6, (%1,%3) \n\t" |
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
692 :: "r" (src), |
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
693 "r" (src + 4*stride), |
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
694 "r" ((long) stride ), |
86e2b1424801
optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents:
2293
diff
changeset
|
695 "r" ((long)(3*stride)) |
1648 | 696 ); |
697 } | |
698 | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
699 #ifdef CONFIG_ENCODERS |
997
4dfe15ae0078
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    asm volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

        "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

        "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
        "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

        "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
        "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

        "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
        "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"

        "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                            pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"

        "add %2, %0\n"
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
    return tmp;
}

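/* For reference, a hedged scalar sketch of what pix_norm1 computes: the
 * sum of squares of a 16x16 block of pixels.  The helper name
 * pix_norm1_c_ref is ours, not from the source tree; the MMX routine
 * above is an optimized equivalent of this loop. */
static int pix_norm1_c_ref(uint8_t *pix, int line_size)
{
    int i, j, s = 0;
    for (i = 0; i < 16; i++) {      /* 16 rows, matching "movl $16,%%ecx" */
        for (j = 0; j < 16; j++)    /* two movq loads = 16 bytes per row  */
            s += pix[j] * pix[j];
        pix += line_size;
    }
    return s;
}
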
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %4,%%ecx\n"
        "shr $1,%%ecx\n"
        "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
        "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
        "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
        "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
        "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

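/* The "subtract with saturation, then OR" idiom above is worth spelling
 * out.  A hedged scalar sketch (the helper name is ours): for unsigned
 * bytes one of the two saturating differences is always zero, so ORing
 * them yields |a - b| exactly. */
static inline unsigned abs_diff_u8(unsigned a, unsigned b)
{
    unsigned d1 = a > b ? a - b : 0;    /* psubusb %%mm2,%%mm1 */
    unsigned d2 = b > a ? b - a : 0;    /* psubusb %%mm5,%%mm2 */
    return d1 | d2;                     /* por     %%mm1,%%mm2 */
}
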
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
        "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
        "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
        "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "add %3,%0\n"
        "add %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

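/* A hedged scalar sketch of the sum-of-squared-errors metric that both
 * sse8_mmx and sse16_mmx (and the SSE2 variant below) implement; the
 * width parameter and the helper name are ours.  Squaring the absolute
 * difference is exact, since |a-b|^2 == (a-b)^2. */
static int sse_c_ref(uint8_t *pix1, uint8_t *pix2, int line_size, int w, int h)
{
    int x, y, sum = 0;
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++) {
            int d = pix1[x] - pix2[x];
            sum += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
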
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "shr $1,%2\n"
        "pxor %%xmm0,%%xmm0\n"      /* xmm0 = 0 */
        "pxor %%xmm7,%%xmm7\n"      /* xmm7 holds the sum */
        "1:\n"
        "movdqu (%0),%%xmm1\n"      /* xmm1 = pix1[0][0-15] */
        "movdqu (%1),%%xmm2\n"      /* xmm2 = pix2[0][0-15] */
        "movdqu (%0,%4),%%xmm3\n"   /* xmm3 = pix1[1][0-15] */
        "movdqu (%1,%4),%%xmm4\n"   /* xmm4 = pix2[1][0-15] */

        /* todo: xmm1-xmm2, xmm3-xmm4 */
        /* algo: subtract xmm1 from xmm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movdqa %%xmm1,%%xmm5\n"
        "movdqa %%xmm3,%%xmm6\n"
        "psubusb %%xmm2,%%xmm1\n"
        "psubusb %%xmm4,%%xmm3\n"
        "psubusb %%xmm5,%%xmm2\n"
        "psubusb %%xmm6,%%xmm4\n"

        "por %%xmm1,%%xmm2\n"
        "por %%xmm3,%%xmm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movdqa %%xmm2,%%xmm1\n"
        "movdqa %%xmm4,%%xmm3\n"

        "punpckhbw %%xmm0,%%xmm2\n"
        "punpckhbw %%xmm0,%%xmm4\n"
        "punpcklbw %%xmm0,%%xmm1\n" /* xmm1 now spread over (xmm1,xmm2) */
        "punpcklbw %%xmm0,%%xmm3\n" /* xmm4 now spread over (xmm3,xmm4) */

        "pmaddwd %%xmm2,%%xmm2\n"
        "pmaddwd %%xmm4,%%xmm4\n"
        "pmaddwd %%xmm1,%%xmm1\n"
        "pmaddwd %%xmm3,%%xmm3\n"

        "lea (%0,%4,2), %0\n"       /* pix1 += 2*line_size */
        "lea (%1,%4,2), %1\n"       /* pix2 += 2*line_size */

        "paddd %%xmm2,%%xmm1\n"
        "paddd %%xmm4,%%xmm3\n"
        "paddd %%xmm1,%%xmm7\n"
        "paddd %%xmm3,%%xmm7\n"

        "decl %2\n"
        "jnz 1b\n"

        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $8, %%xmm7\n"       /* shift hi qword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $4, %%xmm7\n"       /* shift hi dword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movd %%xmm7,%3\n"
        : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
        : "r" ((long)line_size));
    return tmp;
}

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"        /* mm7 = 0 (unpack source) */
        "pxor %%mm6,%%mm6\n"        /* mm6 accumulates the sum */

        /* horizontal gradients of the first row -> (mm0,mm2) */
        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        /* gradients of the next row -> (mm4,mm5), then accumulate the
           absolute gradient change between the two rows into mm6 */
        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"  /* sign masks for the abs */
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((long)line_size) , "g" (h-2)
        : "%ecx");
    return tmp;
}

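/* A hedged scalar sketch of the noise measure above, from our reading
 * of the asm (the helper name is ours): hf_noise8 sums, over pairs of
 * adjacent rows, the absolute change of the horizontal gradient.  The
 * byte shifts leave 7 usable gradient columns per 8-byte block. */
static int hf_noise8_c_ref(uint8_t *pix, int line_size, int h)
{
    int x, y, sum = 0;
    for (y = 0; y < h - 1; y++) {
        for (x = 0; x < 7; x++) {
            int d0 = pix[x]             - pix[x + 1];             /* gradient in row y   */
            int d1 = pix[x + line_size] - pix[x + line_size + 1]; /* gradient in row y+1 */
            sum += d0 > d1 ? d0 - d1 : d1 - d0;                   /* |d0 - d1|           */
        }
        pix += line_size;
    }
    return sum;
}
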
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"        /* unaligned load of pix+1: 8 gradients per row */
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((long)line_size) , "g" (h-2)
        : "%ecx");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse16_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
    else  return score1 + ABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
    else  return score1 + ABS(score2)*8;
}

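/* Our reading of the two functions above: the noise preserving SSE
 * (NSSE) metric penalizes a candidate block not only for its squared
 * error but also for changing the amount of high-frequency detail, so
 * the encoder is not rewarded for smoothing texture away:
 *
 *     score = SSE(pix1, pix2) + |hf_noise(pix1) - hf_noise(pix2)| * nsse_weight
 *
 * with nsse_weight taken from the codec context, or 8 when no context
 * is available. */
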
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

/* SUM advances one row: it loads the current row, leaves it in out0/out1
   for the next iteration, and accumulates |current row - previous row|
   (the previous row arrives in in0/in1) into mm6 using the saturating
   subtract-and-OR abs trick. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        /* fold the four word partial sums in mm6 into the low word */
        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

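/* A hedged scalar sketch of the vertical-SAD metric the routine above
 * (and its MMX2 variant below) computes; the helper name is ours.  It
 * sums the absolute differences between vertically adjacent pixels of
 * one 16-wide block. */
static int vsad_intra16_c_ref(uint8_t *pix, int line_size, int h)
{
    int x, y, sum = 0;
    for (y = 0; y < h - 1; y++) {
        for (x = 0; x < 16; x++) {
            int d = pix[x] - pix[x + line_size];
            sum += d < 0 ? -d : d;
        }
        pix += line_size;
    }
    return sum;
}
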
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

/* MMX2 variant: psadbw produces the sum of 8 byte absolute differences
   in a single instruction, replacing the subtract/OR/unpack chain. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

/* Like the intra version, but on the residual pix1 - pix2: the byte
   differences are biased by 0x80 (mm7) so the unsigned saturating abs
   trick also works on the signed values. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    asm volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"   /* mm7 = 0x80 in every byte */
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "subl $2, %%ecx\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

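/* A hedged scalar sketch of what vsad16 measures (the helper name is
 * ours): the vertical SAD of the residual pix1 - pix2, i.e. how much
 * the prediction error changes from one row to the next. */
static int vsad16_c_ref(uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int x, y, sum = 0;
    for (y = 0; y < h - 1; y++) {
        for (x = 0; x < 16; x++) {
            int d = (pix1[x] - pix2[x]) - (pix1[x + line_size] - pix2[x + line_size]);
            sum += d < 0 ? -d : d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
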
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    asm volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "subl $2, %%ecx\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t" // min(max(T, L), L + T - LT)
        "pmaxub %%mm1, %%mm4            \n\t" // = median(T, L, L + T - LT)
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}

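/* A hedged scalar sketch of the median prediction the asm performs per
 * byte (essentially the logic of the C fallback; the helper name is
 * ours).  Each output byte is the input minus the median of the left
 * neighbour, the top neighbour, and the gradient prediction L + T - LT. */
static void sub_hfyu_median_prediction_ref(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                           int w, int *left, int *left_top)
{
    int i;
    uint8_t l  = *left;      /* L:  reconstructed pixel to the left */
    uint8_t lt = *left_top;  /* LT: pixel above-left                */
    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
        lt = src1[i];        /* this column's T becomes the next LT */
        l  = src2[i];
        dst[i] = l - pred;
    }
    *left_top = lt;
    *left     = l;
}
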
/* in-place butterfly on two pairs: (a,b) -> (a+b, b-a) */
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "             \n\t"\
    "paddw " #b2 ", " #a2 "             \n\t"\
    "paddw " #b1 ", " #b1 "             \n\t"\
    "paddw " #b2 ", " #b2 "             \n\t"\
    "psubw " #a1 ", " #b1 "             \n\t"\
    "psubw " #a2 ", " #b2 "             \n\t"

/* three butterfly stages = an 8-point Walsh-Hadamard transform,
   applied to each of the four word lanes at once */
#define HADAMARD48\
    LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
    LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
    LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
    LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
    LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
    LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\

/* |a| via sign mask: z = (a < 0) ? ~0 : 0, then (a ^ z) - z */
#define MMABS(a,z)\
    "pxor " #z ", " #z "                \n\t"\
    "pcmpgtw " #a ", " #z "             \n\t"\
    "pxor " #z ", " #a "                \n\t"\
    "psubw " #z ", " #a "               \n\t"

#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z "                \n\t"\
    "pcmpgtw " #a ", " #z "             \n\t"\
    "pxor " #z ", " #a "                \n\t"\
    "psubw " #z ", " #a "               \n\t"\
    "paddusw " #a ", " #sum "           \n\t"

/* MMX2: |a| = max(a, -a) */
#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "                \n\t"\
    "psubw " #a ", " #z "               \n\t"\
    "pmaxsw " #z ", " #a "              \n\t"

#define MMABS_SUM_MMX2(a,z, sum)\
    "pxor " #z ", " #z "                \n\t"\
    "psubw " #a ", " #z "               \n\t"\
    "pmaxsw " #z ", " #a "              \n\t"\
    "paddusw " #a ", " #sum "           \n\t"

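/* Scalar form of the MMABS trick (the helper name is ours): for a
 * 16-bit value other than -32768, m is all ones exactly when a is
 * negative, and (a ^ m) - m flips the bits and adds one, i.e. negates. */
static inline int mmabs_ref(int a)
{
    int m = a < 0 ? -1 : 0;  /* what pcmpgtw computes */
    return (a ^ m) - m;      /* pxor + psubw          */
}
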
#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "                \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "       \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "       \n\t" /* cgdh */\

#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1), " #a "              \n\t"\
    "movq "#o"+16(%1), " #b "           \n\t"\
    "movq "#o"+32(%1), " #c "           \n\t"\
    "movq "#o"+48(%1), " #d "           \n\t"

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)                \n\t"\
    "movq "#b", "#o"+16(%1)             \n\t"\
    "movq "#c", "#o"+32(%1)             \n\t"\
    "movq "#d", "#o"+48(%1)             \n\t"\
1708 | 1544 static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){ |
936 | 1545 uint64_t temp[16] __align8; |
1546 int sum=0; | |
1708 | 1547 |
1548 assert(h==8); | |
936 | 1549 |
1550 diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride); | |
1551 | |
1552 asm volatile( | |
1553 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) | |
1554 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7) | |
1555 | |
1556 HADAMARD48 | |
1557 | |
1558 "movq %%mm7, 112(%1) \n\t" | |
1559 | |
1560 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) | |
1561 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2) | |
1562 | |
1563 "movq 112(%1), %%mm7 \n\t" | |
1564 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) | |
1565 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6) | |
1566 | |
1567 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3) | |
1568 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) | |
1569 | |
1570 HADAMARD48 | |
1571 | |
1572 "movq %%mm7, 120(%1) \n\t" | |
1573 | |
1574 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) | |
1575 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2) | |
1576 | |
1577 "movq 120(%1), %%mm7 \n\t" | |
1578 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) | |
1579 "movq %%mm7, %%mm5 \n\t"//FIXME remove | |
1580 "movq %%mm6, %%mm7 \n\t" | |
1581 "movq %%mm0, %%mm6 \n\t" | |
1582 // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove | |
1583 | |
1584 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3) | |
1585 // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) | |
1586 | |
1587 HADAMARD48 | |
1588 "movq %%mm7, 64(%1) \n\t" | |
1589 MMABS(%%mm0, %%mm7) | |
1590 MMABS_SUM(%%mm1, %%mm7, %%mm0) | |
1591 MMABS_SUM(%%mm2, %%mm7, %%mm0) | |
1592 MMABS_SUM(%%mm3, %%mm7, %%mm0) | |
1593 MMABS_SUM(%%mm4, %%mm7, %%mm0) | |
1594 MMABS_SUM(%%mm5, %%mm7, %%mm0) | |
1595 MMABS_SUM(%%mm6, %%mm7, %%mm0) | |
1596 "movq 64(%1), %%mm1 \n\t" | |
1597 MMABS_SUM(%%mm1, %%mm7, %%mm0) | |
1598 "movq %%mm0, 64(%1) \n\t" | |
1599 | |
1600 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) | |
1601 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7) | |
1602 | |
1603 HADAMARD48 | |
1604 "movq %%mm7, (%1) \n\t" | |
1605 MMABS(%%mm0, %%mm7) | |
1606 MMABS_SUM(%%mm1, %%mm7, %%mm0) | |
1607 MMABS_SUM(%%mm2, %%mm7, %%mm0) | |
1608 MMABS_SUM(%%mm3, %%mm7, %%mm0) | |
1609 MMABS_SUM(%%mm4, %%mm7, %%mm0) | |
1610 MMABS_SUM(%%mm5, %%mm7, %%mm0) | |
1611 MMABS_SUM(%%mm6, %%mm7, %%mm0) | |
1612 "movq (%1), %%mm1 \n\t" | |
1613 MMABS_SUM(%%mm1, %%mm7, %%mm0) | |
1614 "movq 64(%1), %%mm1 \n\t" | |
1615 MMABS_SUM(%%mm1, %%mm7, %%mm0) | |
1616 | |
1617 "movq %%mm0, %%mm1 \n\t" | |
1618 "psrlq $32, %%mm0 \n\t" | |
1619 "paddusw %%mm1, %%mm0 \n\t" | |
1620 "movq %%mm0, %%mm1 \n\t" | |
1621 "psrlq $16, %%mm0 \n\t" | |
1622 "paddusw %%mm1, %%mm0 \n\t" | |
1623 "movd %%mm0, %0 \n\t" | |
1624 | |
1625 : "=r" (sum) | |
1626 : "r"(temp) | |
1627 ); | |
1628 return sum&0xFFFF; | |
1629 } | |
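/* For reference, an untested scalar sketch of what the function above
   computes (hadamard8_diff_c_ref and hadamard8_1d_ref are hypothetical
   names, not part of this file). Unlike the MMX code, which accumulates
   with saturating paddusw and returns sum&0xFFFF, this plain version
   never saturates. */
#if 0
static void hadamard8_1d_ref(int16_t *p, int step){
    int i, j, k;
    for(k=1; k<8; k<<=1)                /* three butterfly stages */
        for(i=0; i<8; i+=2*k)
            for(j=i; j<i+k; j++){
                int a= p[ j   *step];
                int b= p[(j+k)*step];
                p[ j   *step]= a + b;
                p[(j+k)*step]= a - b;
            }
}
static int hadamard8_diff_c_ref(uint8_t *src1, uint8_t *src2, int stride){
    int16_t tmp[64];
    int i, sum=0;
    for(i=0; i<64; i++)
        tmp[i]= src1[(i>>3)*stride + (i&7)] - src2[(i>>3)*stride + (i&7)];
    for(i=0; i<8; i++) hadamard8_1d_ref(tmp + 8*i, 1); /* rows    */
    for(i=0; i<8; i++) hadamard8_1d_ref(tmp + i,   8); /* columns */
    for(i=0; i<64; i++) sum += ABS(tmp[i]);
    return sum;
}
#endif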
1630 | |
1708 | 1631 static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){ |
1153 | 1632 uint64_t temp[16] __align8; |
1633 int sum=0; | |
1708 | 1634 |
1635 assert(h==8); | |
1153 | 1636 |
1637 diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride); | |
1638 | |
1639 asm volatile( | |
1640 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) | |
1641 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7) | |
1642 | |
1643 HADAMARD48 | |
1644 | |
1645 "movq %%mm7, 112(%1) \n\t" | |
1646 | |
1647 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) | |
1648 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2) | |
1649 | |
1650 "movq 112(%1), %%mm7 \n\t" | |
1651 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) | |
1652 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6) | |
1653 | |
1654 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3) | |
1655 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) | |
1656 | |
1657 HADAMARD48 | |
1658 | |
1659 "movq %%mm7, 120(%1) \n\t" | |
1660 | |
1661 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) | |
1662 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2) | |
1663 | |
1664 "movq 120(%1), %%mm7 \n\t" | |
1665 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) | |
1666 "movq %%mm7, %%mm5 \n\t"//FIXME remove | |
1667 "movq %%mm6, %%mm7 \n\t" | |
1668 "movq %%mm0, %%mm6 \n\t" | |
1669 // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove | |
1670 | |
1671 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3) | |
1672 // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) | |
1673 | |
1674 HADAMARD48 | |
1675 "movq %%mm7, 64(%1) \n\t" | |
1676 MMABS_MMX2(%%mm0, %%mm7) | |
1677 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) | |
1678 MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0) | |
1679 MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0) | |
1680 MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0) | |
1681 MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0) | |
1682 MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0) | |
1683 "movq 64(%1), %%mm1 \n\t" | |
1684 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) | |
1685 "movq %%mm0, 64(%1) \n\t" | |
1686 | |
1687 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) | |
1688 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7) | |
1689 | |
1690 HADAMARD48 | |
1691 "movq %%mm7, (%1) \n\t" | |
1692 MMABS_MMX2(%%mm0, %%mm7) | |
1693 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) | |
1694 MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0) | |
1695 MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0) | |
1696 MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0) | |
1697 MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0) | |
1698 MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0) | |
1699 "movq (%1), %%mm1 \n\t" | |
1700 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) | |
1701 "movq 64(%1), %%mm1 \n\t" | |
1702 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) | |
1703 | |
2892 | 1704 "pshufw $0x0E, %%mm0, %%mm1 \n\t"
1153 | 1705 "paddusw %%mm1, %%mm0 \n\t" |
2892 | 1706 "pshufw $0x01, %%mm0, %%mm1 \n\t"
1153 | 1707 "paddusw %%mm1, %%mm0 \n\t" |
1708 "movd %%mm0, %0 \n\t" | |
1709 | |
1710 : "=r" (sum) | |
1711 : "r"(temp) | |
1712 ); | |
1713 return sum&0xFFFF; | |
1714 } | |
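/* The two pshufw shuffles above implement the horizontal add of four
   words: $0x0E copies words 2,3 down (replacing movq+psrlq $32) and
   $0x01 copies word 1 down (replacing movq+psrlq $16). */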
1715 | |
1716 | |
1708 | 1717 WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx) |
1718 WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2) | |
1530 | 1719 #endif //CONFIG_ENCODERS
866 | 1720 |
959 | 1721 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d) |
1722 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d) | |
1723 | |
954 | 1724 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ |
1725 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ | |
961 | 1726 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ |
954 | 1727 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ |
1728 "movq "#in7", " #m3 " \n\t" /* d */\ | |
1729 "movq "#in0", %%mm5 \n\t" /* D */\ | |
1730 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\ | |
1731 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\ | |
1732 "movq "#in1", %%mm5 \n\t" /* C */\ | |
1733 "movq "#in2", %%mm6 \n\t" /* B */\ | |
1734 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\ | |
1735 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ | |
1736 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ | |
1737 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ | |
961 | 1738 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ |
954 | 1739 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\ |
1740 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ | |
1741 "psraw $5, %%mm5 \n\t"\ | |
1742 "packuswb %%mm5, %%mm5 \n\t"\ | |
1743 OP(%%mm5, out, %%mm7, d) | |
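/* QPEL_V_LOW evaluates one output row of the MPEG-4 quarter-pel 8 tap
   filter (-1 3 -6 20 20 -6 3 -1)/32: with x1..x4 the sums of the
   symmetric tap pairs, it computes (20*x1 - 6*x2 + 3*x3 - x4 + rnd)>>5
   and packs the result with unsigned saturation. */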
1744 | |
959 | 1745 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\ |
1057 | 1746 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
954 | 1747 uint64_t temp;\ |
1748 \ | |
1749 asm volatile(\ | |
1750 "pxor %%mm7, %%mm7 \n\t"\ | |
1751 "1: \n\t"\ | |
1752 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ | |
1753 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ | |
1754 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ | |
1755 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ | |
1756 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ | |
1757 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ | |
1758 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ | |
1759 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ | |
1760 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ | |
1761 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ | |
1762 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ | |
1763 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ | |
1764 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ | |
1765 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ | |
1766 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ | |
1767 "paddw %%mm3, %%mm5 \n\t" /* b */\ | |
1768 "paddw %%mm2, %%mm6 \n\t" /* c */\ | |
1769 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
1770 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ | |
1771 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ | |
961 | 1772 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ |
954 | 1773 "paddw %%mm4, %%mm0 \n\t" /* a */\ |
1774 "paddw %%mm1, %%mm5 \n\t" /* d */\ | |
961 | 1775 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ |
954 | 1776 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ |
961 | 1777 "paddw %6, %%mm6 \n\t"\ |
954 | 1778 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
1779 "psraw $5, %%mm0 \n\t"\ | |
961 | 1780 "movq %%mm0, %5 \n\t"\ |
954 | 1781 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ |
1782 \ | |
1783 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ | |
1784 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\ | |
1785 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\ | |
1786 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\ | |
1787 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\ | |
1788 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\ | |
1789 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\ | |
1790 "paddw %%mm0, %%mm2 \n\t" /* b */\ | |
1791 "paddw %%mm5, %%mm3 \n\t" /* c */\ | |
1792 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ | |
1793 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ | |
1794 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\ | |
1795 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ | |
1796 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ | |
1797 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ | |
961 | 1798 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ |
954 | 1799 "paddw %%mm2, %%mm1 \n\t" /* a */\ |
1800 "paddw %%mm6, %%mm4 \n\t" /* d */\ | |
961 | 1801 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ |
954 | 1802 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ |
961 | 1803 "paddw %6, %%mm1 \n\t"\ |
954 | 1804 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ |
1805 "psraw $5, %%mm3 \n\t"\ | |
961 | 1806 "movq %5, %%mm1 \n\t"\ |
954 | 1807 "packuswb %%mm3, %%mm1 \n\t"\ |
959 | 1808 OP_MMX2(%%mm1, (%1),%%mm4, q)\ |
954 | 1809 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ |
1810 \ | |
1811 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ | |
1812 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ | |
1813 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ | |
1814 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\ | |
1815 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\ | |
1816 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\ | |
1817 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\ | |
1818 "paddw %%mm1, %%mm5 \n\t" /* b */\ | |
1819 "paddw %%mm4, %%mm0 \n\t" /* c */\ | |
1820 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
1821 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ | |
1822 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ | |
1823 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ | |
961 | 1824 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ |
954 | 1825 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ |
1826 "paddw %%mm3, %%mm2 \n\t" /* d */\ | |
1827 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ | |
1828 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\ | |
1829 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ | |
1830 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ | |
1831 "paddw %%mm2, %%mm6 \n\t" /* a */\ | |
961 | 1832 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ |
1833 "paddw %6, %%mm0 \n\t"\ | |
954 | 1834 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
1835 "psraw $5, %%mm0 \n\t"\ | |
1836 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ | |
1837 \ | |
1838 "paddw %%mm5, %%mm3 \n\t" /* a */\ | |
1839 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\ | |
1840 "paddw %%mm4, %%mm6 \n\t" /* b */\ | |
1841 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\ | |
1842 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\ | |
1843 "paddw %%mm1, %%mm4 \n\t" /* c */\ | |
1844 "paddw %%mm2, %%mm5 \n\t" /* d */\ | |
1845 "paddw %%mm6, %%mm6 \n\t" /* 2b */\ | |
1846 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ | |
961 | 1847 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ |
1848 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ | |
954 | 1849 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ |
961 | 1850 "paddw %6, %%mm4 \n\t"\ |
954 | 1851 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ |
1852 "psraw $5, %%mm4 \n\t"\ | |
1853 "packuswb %%mm4, %%mm0 \n\t"\ | |
959 | 1854 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ |
954 | 1855 \ |
2293 | 1856 "add %3, %0 \n\t"\
1857 "add %4, %1 \n\t"\
954 | 1858 "decl %2 \n\t"\ |
1859 " jnz 1b \n\t"\ | |
967 | 1860 : "+a"(src), "+c"(dst), "+m"(h)\ |
2293 | 1861 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966 | 1862 : "memory"\ |
954 | 1863 );\ |
1864 }\ | |
1865 \ | |
1866 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
1867 int i;\ | |
1868 int16_t temp[16];\ | |
1869 /* quick HACK, XXX FIXME MUST be optimized */\ | |
1870 for(i=0; i<h; i++)\ | |
1871 {\ | |
1872 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ | |
1873 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ | |
1874 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ | |
1875 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ | |
1876 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ | |
1877 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\ | |
1878 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\ | |
1879 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\ | |
1880 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\ | |
1881 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\ | |
1882 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\ | |
1883 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\ | |
1884 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\ | |
1885 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\ | |
1886 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\ | |
1887 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\ | |
1888 asm volatile(\ | |
1889 "movq (%0), %%mm0 \n\t"\ | |
1890 "movq 8(%0), %%mm1 \n\t"\ | |
1891 "paddw %2, %%mm0 \n\t"\ | |
1892 "paddw %2, %%mm1 \n\t"\ | |
1893 "psraw $5, %%mm0 \n\t"\ | |
1894 "psraw $5, %%mm1 \n\t"\ | |
1895 "packuswb %%mm1, %%mm0 \n\t"\ | |
959 | 1896 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ |
954 | 1897 "movq 16(%0), %%mm0 \n\t"\ |
1898 "movq 24(%0), %%mm1 \n\t"\ | |
1899 "paddw %2, %%mm0 \n\t"\ | |
1900 "paddw %2, %%mm1 \n\t"\ | |
1901 "psraw $5, %%mm0 \n\t"\ | |
1902 "psraw $5, %%mm1 \n\t"\ | |
1903 "packuswb %%mm1, %%mm0 \n\t"\ | |
959 | 1904 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\ |
954 | 1905 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ |
966 | 1906 : "memory"\ |
954 | 1907 );\ |
1908 dst+=dstStride;\ | |
1909 src+=srcStride;\ | |
1910 }\ | |
1911 }\ | |
1912 \ | |
1057 | 1913 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
959 | 1914 uint64_t temp;\ |
1915 \ | |
1916 asm volatile(\ | |
1917 "pxor %%mm7, %%mm7 \n\t"\ | |
1918 "1: \n\t"\ | |
1919 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ | |
1920 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ | |
1921 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ | |
1922 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ | |
1923 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ | |
1924 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ | |
1925 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ | |
1926 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ | |
1927 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ | |
1928 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ | |
1929 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ | |
1930 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ | |
1931 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ | |
1932 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ | |
1933 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ | |
1934 "paddw %%mm3, %%mm5 \n\t" /* b */\ | |
1935 "paddw %%mm2, %%mm6 \n\t" /* c */\ | |
1936 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
1937 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ | |
1938 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ | |
961 | 1939 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ |
959 | 1940 "paddw %%mm4, %%mm0 \n\t" /* a */\ |
1941 "paddw %%mm1, %%mm5 \n\t" /* d */\ | |
961 | 1942 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ |
959 | 1943 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ |
961 | 1944 "paddw %6, %%mm6 \n\t"\ |
959 | 1945 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
1946 "psraw $5, %%mm0 \n\t"\ | |
1947 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ | |
1948 \ | |
1949 "movd 5(%0), %%mm5 \n\t" /* FGHI */\ | |
1950 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\ | |
1951 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\ | |
1952 "paddw %%mm5, %%mm1 \n\t" /* a */\ | |
1953 "paddw %%mm6, %%mm2 \n\t" /* b */\ | |
1954 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\ | |
1955 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\ | |
1956 "paddw %%mm6, %%mm3 \n\t" /* c */\ | |
1957 "paddw %%mm5, %%mm4 \n\t" /* d */\ | |
1958 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ | |
1959 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ | |
961 | 1960 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ |
1961 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ | |
959 | 1962 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ |
961 | 1963 "paddw %6, %%mm1 \n\t"\ |
959 | 1964 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ |
1965 "psraw $5, %%mm3 \n\t"\ | |
1966 "packuswb %%mm3, %%mm0 \n\t"\ | |
1967 OP_MMX2(%%mm0, (%1), %%mm4, q)\ | |
1968 \ | |
2293 | 1969 "add %3, %0 \n\t"\
1970 "add %4, %1 \n\t"\
959 | 1971 "decl %2 \n\t"\ |
961 | 1972 " jnz 1b \n\t"\ |
967 | 1973 : "+a"(src), "+c"(dst), "+m"(h)\ |
2293 | 1974 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966 | 1975 : "memory"\ |
959 | 1976 );\ |
1977 }\ | |
1978 \ | |
1979 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
1980 int i;\ | |
1981 int16_t temp[8];\ | |
1982 /* quick HACK, XXX FIXME MUST be optimized */\ | |
1983 for(i=0; i<h; i++)\ | |
1984 {\ | |
1985 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ | |
1986 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ | |
1987 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ | |
1988 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ | |
1989 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ | |
1990 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\ | |
1991 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\ | |
1992 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\ | |
1993 asm volatile(\ | |
1994 "movq (%0), %%mm0 \n\t"\ | |
1995 "movq 8(%0), %%mm1 \n\t"\ | |
1996 "paddw %2, %%mm0 \n\t"\ | |
1997 "paddw %2, %%mm1 \n\t"\ | |
1998 "psraw $5, %%mm0 \n\t"\ | |
1999 "psraw $5, %%mm1 \n\t"\ | |
2000 "packuswb %%mm1, %%mm0 \n\t"\ | |
2001 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ | |
2002 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ | |
966 | 2003 :"memory"\ |
959 | 2004 );\ |
2005 dst+=dstStride;\ | |
2006 src+=srcStride;\ | |
2007 }\ | |
2008 } | |
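/* In the _3dnow horizontal paths above, the filter itself runs as scalar
   C per row (plain 3DNow! lacks pshufw), and MMX is used only to round,
   shift and pack the 16 bit intermediates back to pixels. */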
2009 | |
2010 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ | |
2011 \ | |
2012 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
954 | 2013 uint64_t temp[17*4];\ |
2014 uint64_t *temp_ptr= temp;\ | |
2015 int count= 17;\ | |
2016 \ | |
2017 /*FIXME unroll */\ | |
2018 asm volatile(\ | |
2019 "pxor %%mm7, %%mm7 \n\t"\ | |
2020 "1: \n\t"\ | |
2021 "movq (%0), %%mm0 \n\t"\ | |
2022 "movq (%0), %%mm1 \n\t"\ | |
2023 "movq 8(%0), %%mm2 \n\t"\ | |
2024 "movq 8(%0), %%mm3 \n\t"\ | |
2025 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2026 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
2027 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
2028 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
2029 "movq %%mm0, (%1) \n\t"\ | |
2030 "movq %%mm1, 17*8(%1) \n\t"\ | |
967 | 2031 "movq %%mm2, 2*17*8(%1) \n\t"\ |
2032 "movq %%mm3, 3*17*8(%1) \n\t"\ | |
2293 | 2033 "add $8, %1 \n\t"\
2034 "add %3, %0 \n\t"\
954 | 2035 "decl %2 \n\t"\ |
2036 " jnz 1b \n\t"\ | |
2037 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ | |
2293 | 2038 : "r" ((long)srcStride)\
966 | 2039 : "memory"\ |
954 | 2040 );\ |
2041 \ | |
2042 temp_ptr= temp;\ | |
2043 count=4;\ | |
2044 \ | |
2045 /*FIXME reorder for speed */\ | |
2046 asm volatile(\ | |
2047 /*"pxor %%mm7, %%mm7 \n\t"*/\ | |
2048 "1: \n\t"\ | |
2049 "movq (%0), %%mm0 \n\t"\ | |
2050 "movq 8(%0), %%mm1 \n\t"\ | |
2051 "movq 16(%0), %%mm2 \n\t"\ | |
2052 "movq 24(%0), %%mm3 \n\t"\ | |
961 | 2053 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
2054 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ | |
2293 | 2055 "add %4, %1 \n\t"\
961 | 2056 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
954 | 2057 \ |
961 | 2058 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
2293 | 2059 "add %4, %1 \n\t"\
961 | 2060 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
2061 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ | |
2293 | 2062 "add %4, %1 \n\t"\
961 | 2063 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ |
2064 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ | |
2293 | 2065 "add %4, %1 \n\t"\
961 | 2066 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ |
2067 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ | |
2293 | 2068 "add %4, %1 \n\t"\
961 | 2069 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ |
2070 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ | |
2293 | 2071 "add %4, %1 \n\t"\
961 | 2072 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ |
954 | 2073 \ |
961 | 2074 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ |
2293 | 2075 "add %4, %1 \n\t" \
961 | 2076 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ |
2077 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ | |
954 | 2078 \ |
2293 | 2079 "add $136, %0 \n\t"\
2080 "add %6, %1 \n\t"\
954 | 2081 "decl %2 \n\t"\ |
2082 " jnz 1b \n\t"\ | |
958 | 2083 \
967 | 2084 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
2293 | 2085 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
966 | 2086 :"memory"\ |
954 | 2087 );\ |
2088 }\ | |
2089 \ | |
1057 | 2090 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2209 | 2091 uint64_t temp[9*2];\ |
954 | 2092 uint64_t *temp_ptr= temp;\ |
2093 int count= 9;\ | |
2094 \ | |
2095 /*FIXME unroll */\ | |
2096 asm volatile(\ | |
2097 "pxor %%mm7, %%mm7 \n\t"\ | |
2098 "1: \n\t"\ | |
2099 "movq (%0), %%mm0 \n\t"\ | |
2100 "movq (%0), %%mm1 \n\t"\ | |
2101 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2102 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
2103 "movq %%mm0, (%1) \n\t"\ | |
2104 "movq %%mm1, 9*8(%1) \n\t"\ | |
2293 | 2105 "add $8, %1 \n\t"\
2106 "add %3, %0 \n\t"\
954 | 2107 "decl %2 \n\t"\ |
2108 " jnz 1b \n\t"\ | |
2109 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ | |
2293 | 2110 : "r" ((long)srcStride)\
966 | 2111 : "memory"\ |
954 | 2112 );\ |
2113 \ | |
2114 temp_ptr= temp;\ | |
2115 count=2;\ | |
2116 \ | |
2117 /*FIXME reorder for speed */\ | |
2118 asm volatile(\ | |
2119 /*"pxor %%mm7, %%mm7 \n\t"*/\ | |
2120 "1: \n\t"\ | |
2121 "movq (%0), %%mm0 \n\t"\ | |
2122 "movq 8(%0), %%mm1 \n\t"\ | |
2123 "movq 16(%0), %%mm2 \n\t"\ | |
2124 "movq 24(%0), %%mm3 \n\t"\ | |
961 | 2125 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
2126 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ | |
2293 | 2127 "add %4, %1 \n\t"\
961 | 2128 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
954 | 2129 \ |
961 | 2130 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
2293 | 2131 "add %4, %1 \n\t"\
961 | 2132 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
954 | 2133 \ |
961 | 2134 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ |
2293 | 2135 "add %4, %1 \n\t"\
961 | 2136 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ |
2137 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ | |
954 | 2138 \ |
2293 | 2139 "add $72, %0 \n\t"\
2140 "add %6, %1 \n\t"\
954 | 2141 "decl %2 \n\t"\ |
2142 " jnz 1b \n\t"\ | |
2143 \ | |
961 | 2144 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
2293 | 2145 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
966 | 2146 : "memory"\ |
2147 );\ | |
959 | 2148 }\ |
954 | 2149 \ |
1064 | 2150 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ |
959 | 2151 OPNAME ## pixels8_mmx(dst, src, stride, 8);\ |
954 | 2152 }\ |
2153 \ | |
1064 | 2154 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2155 uint64_t temp[8];\ |
954 | 2156 uint8_t * const half= (uint8_t*)temp;\ |
2157 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
2207 | 2158 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
954 | 2159 }\ |
2160 \ | |
1064 | 2161 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2162 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ |
2163 }\ | |
2164 \ | |
1064 | 2165 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2166 uint64_t temp[8];\ |
954 | 2167 uint8_t * const half= (uint8_t*)temp;\ |
2168 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
2207 | 2169 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
954 | 2170 }\ |
2171 \ | |
1064 | 2172 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2173 uint64_t temp[8];\ |
954 | 2174 uint8_t * const half= (uint8_t*)temp;\ |
959 | 2175 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ |
2207 | 2176 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
954 | 2177 }\ |
2178 \ | |
1064 | 2179 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
959 | 2180 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ |
954 | 2181 }\ |
2182 \ | |
1064 | 2183 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2184 uint64_t temp[8];\ |
954 | 2185 uint8_t * const half= (uint8_t*)temp;\ |
959 | 2186 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ |
2207 | 2187 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
954 | 2188 }\ |
1064 | 2189 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2190 uint64_t half[8 + 9];\ |
2191 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
2192 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2193 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207 | 2194 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
959 | 2195 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207 | 2196 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954 | 2197 }\ |
1064 | 2198 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2199 uint64_t half[8 + 9];\ |
2200 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
2201 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2202 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207 | 2203 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
959 | 2204 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207 | 2205 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954 | 2206 }\ |
1064 | 2207 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2208 uint64_t half[8 + 9];\ |
2209 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
2210 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2211 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207 | 2212 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
959 | 2213 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207 | 2214 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 | 2215 }\ |
1064 | 2216 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2217 uint64_t half[8 + 9];\ |
2218 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
2219 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2220 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
2207 | 2221 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
959 | 2222 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207 | 2223 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 | 2224 }\ |
1064 | 2225 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2226 uint64_t half[8 + 9];\ |
954 | 2227 uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
2228 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2229 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
959 | 2230 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207 | 2231 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954 | 2232 }\ |
1064 | 2233 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2234 uint64_t half[8 + 9];\ |
954 | 2235 uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
2236 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2237 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
959 | 2238 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207 | 2239 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 | 2240 }\ |
1064 | 2241 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2242 uint64_t half[8 + 9];\ |
2243 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 2244 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207 | 2245 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
984 | 2246 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ |
954 | 2247 }\ |
1064 | 2248 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2249 uint64_t half[8 + 9];\ |
2250 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 2251 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207 | 2252 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
984 | 2253 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ |
954 | 2254 }\ |
1064 | 2255 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2256 uint64_t half[9];\ |
954 | 2257 uint8_t * const halfH= ((uint8_t*)half);\ |
2258 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
959 | 2259 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ |
954 | 2260 }\ |
1064 | 2261 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ |
959 | 2262 OPNAME ## pixels16_mmx(dst, src, stride, 16);\ |
954 | 2263 }\ |
2264 \ | |
1064 | 2265 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2266 uint64_t temp[32];\ |
2267 uint8_t * const half= (uint8_t*)temp;\ | |
2268 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ | |
2207 | 2269 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954 | 2270 }\ |
2271 \ | |
1064 | 2272 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2273 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ |
2274 }\ | |
2275 \ | |
1064 | 2276 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2277 uint64_t temp[32];\ |
2278 uint8_t * const half= (uint8_t*)temp;\ | |
2279 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ | |
2207 | 2280 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
954 | 2281 }\ |
2282 \ | |
1064 | 2283 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2284 uint64_t temp[32];\ |
2285 uint8_t * const half= (uint8_t*)temp;\ | |
959 | 2286 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ |
2207 | 2287 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954 | 2288 }\ |
2289 \ | |
1064 | 2290 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
959 | 2291 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ |
954 | 2292 }\ |
2293 \ | |
1064 | 2294 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2295 uint64_t temp[32];\ |
2296 uint8_t * const half= (uint8_t*)temp;\ | |
959 | 2297 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ |
2207 | 2298 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
954 | 2299 }\ |
1064 | 2300 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2301 uint64_t half[16*2 + 17*2];\ |
2302 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2303 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2304 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207 | 2305 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959 | 2306 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207 | 2307 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 | 2308 }\ |
1064 | 2309 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2310 uint64_t half[16*2 + 17*2];\ |
2311 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2312 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2313 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207 | 2314 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959 | 2315 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207 | 2316 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 | 2317 }\ |
1064 | 2318 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2319 uint64_t half[16*2 + 17*2];\ |
2320 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2321 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2322 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207 | 2323 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959 | 2324 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207 | 2325 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 | 2326 }\ |
1064 | 2327 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2328 uint64_t half[16*2 + 17*2];\ |
2329 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2330 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2331 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
2207 | 2332 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959 | 2333 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207 | 2334 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 | 2335 }\ |
1064 | 2336 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2337 uint64_t half[16*2 + 17*2];\ |
2338 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2339 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2340 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
959 | 2341 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207 | 2342 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 | 2343 }\ |
1064 | 2344 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2345 uint64_t half[16*2 + 17*2];\ |
2346 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2347 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2348 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
959 | 2349 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207 | 2350 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 | 2351 }\ |
1064 | 2352 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2353 uint64_t half[17*2];\ |
2354 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 2355 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207 | 2356 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
984 | 2357 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
954 | 2358 }\ |
1064 | 2359 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2360 uint64_t half[17*2];\ |
2361 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 2362 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207 | 2363 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
984 | 2364 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
954 | 2365 }\ |
1064 | 2366 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2367 uint64_t half[17*2];\ |
2368 uint8_t * const halfH= ((uint8_t*)half);\ | |
2369 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
959 | 2370 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
954 | 2371 } |
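/* Naming: qpelN_mcXY computes the NxN prediction at the quarter-pel
   offset (X/4, Y/4), X horizontal and Y vertical; each position is
   built from the h/v lowpass filters above plus pixelsN_l2 averaging
   of two candidate planes. */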
2372 | |
2373 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" | |
959 | 2374 #define AVG_3DNOW_OP(a,b,temp, size) \ |
954 | 2375 "mov" #size " " #b ", " #temp " \n\t"\ |
2376 "pavgusb " #temp ", " #a " \n\t"\ | |
2377 "mov" #size " " #a ", " #b " \n\t" | |
959 | 2378 #define AVG_MMX2_OP(a,b,temp, size) \ |
954 | 2379 "mov" #size " " #b ", " #temp " \n\t"\ |
2380 "pavgb " #temp ", " #a " \n\t"\ | |
2381 "mov" #size " " #a ", " #b " \n\t" | |
959 | 2382 |
2383 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP) | |
2384 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP) | |
2385 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP) | |
2386 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow) | |
2387 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow) | |
2388 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) | |
954 | 2389 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) |
959 | 2390 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) |
954 | 2391 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) |
2392 | |
393 | 2393 #if 0 |
247 | 2394 static void just_return() { return; }
393 | 2395 #endif |
247 | 2396 
954 | 2397 #define SET_QPEL_FUNC(postfix1, postfix2) \ |
2398 c->put_ ## postfix1 = put_ ## postfix2;\ | |
2399 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\ | |
2400 c->avg_ ## postfix1 = avg_ ## postfix2; | |
1092 | 2401 |
1784 | 2402 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){ |
2293 | 2403 long i=0;
1784 | 2404 |
2405 assert(ABS(scale) < 256); | |
2406 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; | |
2407 | |
2408 asm volatile( | |
2409 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w | |
2410 "psrlw $15, %%mm6 \n\t" // 1w | |
2411 "pxor %%mm7, %%mm7 \n\t" | |
2412 "movd %4, %%mm5 \n\t" | |
2413 "punpcklwd %%mm5, %%mm5 \n\t" | |
2414 "punpcklwd %%mm5, %%mm5 \n\t" | |
2415 "1: \n\t" | |
2416 "movq (%1, %0), %%mm0 \n\t" | |
2417 "movq 8(%1, %0), %%mm1 \n\t" | |
2418 "pmulhw %%mm5, %%mm0 \n\t" | |
2419 "pmulhw %%mm5, %%mm1 \n\t" | |
2420 "paddw %%mm6, %%mm0 \n\t" | |
2421 "paddw %%mm6, %%mm1 \n\t" | |
2422 "psraw $1, %%mm0 \n\t" | |
2423 "psraw $1, %%mm1 \n\t" | |
2424 "paddw (%2, %0), %%mm0 \n\t" | |
2425 "paddw 8(%2, %0), %%mm1 \n\t" | |
2426 "psraw $6, %%mm0 \n\t" | |
2427 "psraw $6, %%mm1 \n\t" | |
2428 "pmullw (%3, %0), %%mm0 \n\t" | |
2429 "pmullw 8(%3, %0), %%mm1 \n\t" | |
2430 "pmaddwd %%mm0, %%mm0 \n\t" | |
2431 "pmaddwd %%mm1, %%mm1 \n\t" | |
2432 "paddd %%mm1, %%mm0 \n\t" | |
2433 "psrld $4, %%mm0 \n\t" | |
2434 "paddd %%mm0, %%mm7 \n\t" | |
2293 | 2435 "add $16, %0 \n\t"
2436 "cmp $128, %0 \n\t" //FIXME optimize & bench
1784 | 2437 " jb 1b \n\t" |
2438 "movq %%mm7, %%mm6 \n\t" | |
2439 "psrlq $32, %%mm7 \n\t" | |
2440 "paddd %%mm6, %%mm7 \n\t" | |
2441 "psrld $2, %%mm7 \n\t" | |
2442 "movd %%mm7, %0 \n\t" | |
2443 | |
2444 : "+r" (i) | |
2445 : "r"(basis), "r"(rem), "r"(weight), "g"(scale) | |
2446 ); | |
2447 return i; | |
2448 } | |
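/* An untested scalar sketch of the loop above (try_8x8basis_ref is a
   hypothetical name, not this file's reference implementation). It
   assumes RECON_SHIFT matches the literal psraw $6, that |weight*b|
   fits in 16 bits (pmullw truncates before pmaddwd squares), and it
   shifts each square by 4 where the asm shifts pmaddwd pair sums, so
   rounding can differ in the low bits. */
#if 0
static int try_8x8basis_ref(int16_t rem[64], int16_t weight[64],
                            int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;
    scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
    for(i=0; i<8*8; i++){
        int b= ((basis[i]*scale>>16) + 1)>>1;   /* pmulhw + rounding */
        int w;
        b= (b + rem[i])>>6;                     /* add residual, psraw $6 */
        w= weight[i]*b;                         /* pmullw (low 16 bits) */
        sum+= (w*w)>>4;                         /* pmaddwd, psrld $4 */
    }
    return sum>>2;                              /* final psrld $2 */
}
#endif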
2449 | |
2450 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){ | |
2293 | 2451 long i=0;
1784 | 2452 |
2453 if(ABS(scale) < 256){ | |
2454 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; | |
2455 asm volatile( | |
2456 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w | |
2457 "psrlw $15, %%mm6 \n\t" // 1w | |
2458 "movd %3, %%mm5 \n\t" | |
2459 "punpcklwd %%mm5, %%mm5 \n\t" | |
2460 "punpcklwd %%mm5, %%mm5 \n\t" | |
2461 "1: \n\t" | |
2462 "movq (%1, %0), %%mm0 \n\t" | |
2463 "movq 8(%1, %0), %%mm1 \n\t" | |
2464 "pmulhw %%mm5, %%mm0 \n\t" | |
2465 "pmulhw %%mm5, %%mm1 \n\t" | |
2466 "paddw %%mm6, %%mm0 \n\t" | |
2467 "paddw %%mm6, %%mm1 \n\t" | |
2468 "psraw $1, %%mm0 \n\t" | |
2469 "psraw $1, %%mm1 \n\t" | |
2470 "paddw (%2, %0), %%mm0 \n\t" | |
2471 "paddw 8(%2, %0), %%mm1 \n\t" | |
2472 "movq %%mm0, (%2, %0) \n\t" | |
2473 "movq %%mm1, 8(%2, %0) \n\t" | |
2293 | 2474 "add $16, %0 \n\t"
2475 "cmp $128, %0 \n\t" //FIXME optimize & bench
1784 | 2476 " jb 1b \n\t" |
2477 | |
2478 : "+r" (i) | |
2479 : "r"(basis), "r"(rem), "g"(scale) | |
2480 ); | |
2481 }else{ | |
2482 for(i=0; i<8*8; i++){ | |
2483 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); | |
2484 } | |
2485 } | |
2486 } | |
2754 | 2487 |
2488 #include "h264dsp_mmx.c" | |
1784 | 2489 |
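/* h264dsp_mmx.c is #included rather than compiled on its own, presumably so
   that its functions can stay static and share the macros and aligned
   constants defined in this file. */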
/* external functions, from idct_mmx.c */
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);

void ff_vp3_idct_sse2(int16_t *input_data);
void ff_vp3_idct_mmx(int16_t *data);
void ff_vp3_dsp_init_mmx(void);

/* XXX: these wrapper functions should be removed as soon as all IDCTs are
   converted to the lavc idct API */
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_sse2(block);
    put_signed_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_sse2(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_mmx(block);
    put_signed_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_mmx(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
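/* All the *_idct_put/*_idct_add wrappers above follow one pattern: run the
   IDCT in place on the coefficient block, then either store the clamped
   result (idct_put) or add it to the existing prediction (idct_add).  The
   VP3 variants store through put_signed_pixels_clamped_mmx(), which adds a
   +128 bias while clamping, as the VP3 IDCT output is centered around zero. */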
#ifdef CONFIG_GPL
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
#endif

void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    mm_flags = mm_support();

    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & FF_MM_FORCE)
            mm_flags |= (avctx->dsp_mask & 0xffff);
        else
            mm_flags &= ~(avctx->dsp_mask & 0xffff);
    }

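    /* dsp_mask lets the caller override CPU detection: with FF_MM_FORCE set,
       the masked flags are enabled even if mm_support() did not report them;
       without it, the masked flags are disabled.  For example, setting
       avctx->dsp_mask = MM_MMXEXT before init keeps every MMX2-specific
       function below from being installed. */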
#if 0
    av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
    if (mm_flags & MM_MMX)
        av_log(avctx, AV_LOG_INFO, " mmx");
    if (mm_flags & MM_MMXEXT)
        av_log(avctx, AV_LOG_INFO, " mmxext");
    if (mm_flags & MM_3DNOW)
        av_log(avctx, AV_LOG_INFO, " 3dnow");
    if (mm_flags & MM_SSE)
        av_log(avctx, AV_LOG_INFO, " sse");
    if (mm_flags & MM_SSE2)
        av_log(avctx, AV_LOG_INFO, " sse2");
    av_log(avctx, AV_LOG_INFO, "\n");
#endif

    if (mm_flags & MM_MMX) {
        const int idct_algo= avctx->idct_algo;

#ifdef CONFIG_ENCODERS
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }
#endif //CONFIG_ENCODERS
        if(avctx->lowres==0){
            if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
                c->idct_put= ff_simple_idct_put_mmx;
                c->idct_add= ff_simple_idct_add_mmx;
                c->idct    = ff_simple_idct_mmx;
                c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
            }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
                if(mm_flags & MM_MMXEXT){
                    c->idct_put= ff_libmpeg2mmx2_idct_put;
                    c->idct_add= ff_libmpeg2mmx2_idct_add;
                    c->idct    = ff_mmxext_idct;
                }else{
                    c->idct_put= ff_libmpeg2mmx_idct_put;
                    c->idct_add= ff_libmpeg2mmx_idct_add;
                    c->idct    = ff_mmx_idct;
                }
                c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
            }else if(idct_algo==FF_IDCT_VP3){
                if(mm_flags & MM_SSE2){
                    c->idct_put= ff_vp3_idct_put_sse2;
                    c->idct_add= ff_vp3_idct_add_sse2;
                    c->idct    = ff_vp3_idct_sse2;
                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
                }else{
                    ff_vp3_dsp_init_mmx();
                    c->idct_put= ff_vp3_idct_put_mmx;
                    c->idct_add= ff_vp3_idct_add_mmx;
                    c->idct    = ff_vp3_idct_mmx;
                    c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
                }
#ifdef CONFIG_GPL
            }else if(idct_algo==FF_IDCT_XVIDMMX){
                if(mm_flags & MM_MMXEXT){
                    c->idct_put= ff_idct_xvid_mmx2_put;
                    c->idct_add= ff_idct_xvid_mmx2_add;
                    c->idct    = ff_idct_xvid_mmx2;
                }else{
                    c->idct_put= ff_idct_xvid_mmx_put;
                    c->idct_add= ff_idct_xvid_mmx_add;
                    c->idct    = ff_idct_xvid_mmx;
                }
#endif
            }
        }

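        /* Each IDCT above expects its input coefficients in a different
           order; c->idct_permutation_type tells the generic dsputil code
           which permutation to apply to the scantables and quantizer
           matrices at init time, so the IDCT itself can read the block
           linearly. */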
#ifdef CONFIG_ENCODERS
        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
#endif //CONFIG_ENCODERS
        c->put_pixels_clamped = put_pixels_clamped_mmx;
        c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
        c->add_pixels_clamped = add_pixels_clamped_mmx;
        c->clear_blocks = clear_blocks_mmx;
#ifdef CONFIG_ENCODERS
        c->pix_sum = pix_sum16_mmx;
#endif //CONFIG_ENCODERS

        c->put_pixels_tab[0][0] = put_pixels16_mmx;
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;

        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;

        c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
        c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
        c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
        c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;

        c->put_pixels_tab[1][0] = put_pixels8_mmx;
        c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
        c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;

        c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;

        c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
        c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
        c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
        c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;

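        /* Table layout: the first index picks the block size ([0] = 16x16,
           [1] = 8x8), the second the half-pel interpolation case (0 = copy,
           1 = horizontal, 2 = vertical, 3 = both, the _xy2 functions).  The
           no_rnd variants round down instead of up, as needed for the MPEG
           "no rounding" mode. */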
        c->add_bytes= add_bytes_mmx;
#ifdef CONFIG_ENCODERS
        c->diff_bytes= diff_bytes_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

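        /* vsad16_mmx and try_8x8basis_mmx do not produce bit-identical
           results to the C reference (their rounding differs slightly),
           hence the CODEC_FLAG_BITEXACT guards above. */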
#endif //CONFIG_ENCODERS

        c->h263_v_loop_filter= h263_v_loop_filter_mmx;
        c->h263_h_loop_filter= h263_h_loop_filter_mmx;
        c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;

        if (mm_flags & MM_MMXEXT) {
            c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
            c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;

            c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;

            c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
            c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;

            c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;

#ifdef CONFIG_ENCODERS
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;
#endif //CONFIG_ENCODERS

            c->h264_idct_add= ff_h264_idct_add_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
#ifdef CONFIG_ENCODERS
                c->vsad[0] = vsad16_mmx2;
#endif //CONFIG_ENCODERS
            }

#if 1
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
#endif
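            /* The qpel table index encodes the quarter-pel position as
               x + 4*y, so e.g. qpel16_mc31_mmx2 at index 7 handles
               x = 3/4, y = 1/4 motion for 16x16 blocks. */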

            //FIXME 3dnow too
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2

            dspfunc(put_h264_qpel, 0, 16);
            dspfunc(put_h264_qpel, 1, 8);
            dspfunc(put_h264_qpel, 2, 4);
            dspfunc(avg_h264_qpel, 0, 16);
            dspfunc(avg_h264_qpel, 1, 8);
            dspfunc(avg_h264_qpel, 2, 4);
#undef dspfunc

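            /* Each dspfunc() line above expands to 16 assignments, e.g.
               dspfunc(put_h264_qpel, 0, 16) becomes
                   c->put_h264_qpel_pixels_tab[0][ 0] = put_h264_qpel16_mc00_mmx2;
                   ...
                   c->put_h264_qpel_pixels_tab[0][15] = put_h264_qpel16_mc33_mmx2;
             */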
            c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
            c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
            c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
            c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
            c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
            c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
            c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;

            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;

            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;

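            /* weight/biweight table indices 0..7 correspond to the block
               sizes 16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4 and 4x2; "weight"
               implements H.264 unidirectional weighted prediction,
               "biweight" the bidirectional two-reference case. */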
#ifdef CONFIG_ENCODERS
            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
#endif //CONFIG_ENCODERS
        } else if (mm_flags & MM_3DNOW) {
            c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
            c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

            c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

            c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
            c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

            c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
            }

            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow

            dspfunc(put_h264_qpel, 0, 16);
            dspfunc(put_h264_qpel, 1, 8);
            dspfunc(put_h264_qpel, 2, 4);
            dspfunc(avg_h264_qpel, 0, 16);
            dspfunc(avg_h264_qpel, 1, 8);
            dspfunc(avg_h264_qpel, 2, 4);

            c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
        }
    }

#ifdef CONFIG_ENCODERS
    dsputil_init_pix_mmx(c, avctx);
#endif //CONFIG_ENCODERS
#if 0
    // for speed testing
    get_pixels = just_return;
    put_pixels_clamped = just_return;
    add_pixels_clamped = just_return;

    pix_abs16x16 = just_return;
    pix_abs16x16_x2 = just_return;
    pix_abs16x16_y2 = just_return;
    pix_abs16x16_xy2 = just_return;

    put_pixels_tab[0] = just_return;
    put_pixels_tab[1] = just_return;
    put_pixels_tab[2] = just_return;
    put_pixels_tab[3] = just_return;

    put_no_rnd_pixels_tab[0] = just_return;
    put_no_rnd_pixels_tab[1] = just_return;
    put_no_rnd_pixels_tab[2] = just_return;
    put_no_rnd_pixels_tab[3] = just_return;

    avg_pixels_tab[0] = just_return;
    avg_pixels_tab[1] = just_return;
    avg_pixels_tab[2] = just_return;
    avg_pixels_tab[3] = just_return;

    avg_no_rnd_pixels_tab[0] = just_return;
    avg_no_rnd_pixels_tab[1] = just_return;
    avg_no_rnd_pixels_tab[2] = just_return;
    avg_no_rnd_pixels_tab[3] = just_return;

    //av_fdct = just_return;
    //ff_idct = just_return;
#endif
}