Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 2892:41315d0120b3 libavcodec
replace a few mov + psrlq with pshufw, there are more cases which could benefit from this but they would require us to duplicate some functions ...
the trick is from various places (my own code in libpostproc, a patch on the x264 list, ...)
author | michael |
---|---|
date | Wed, 21 Sep 2005 21:17:09 +0000 |
parents | f4afa3812818 |
children | d3a726717baf |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
1739
07a484280a82
copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents:
1729
diff
changeset
|
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
0 | 5 * |
429 | 6 * This library is free software; you can redistribute it and/or |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2 of the License, or (at your option) any later version. | |
0 | 10 * |
429 | 11 * This library is distributed in the hope that it will be useful, |
0 | 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 * Lesser General Public License for more details. | |
0 | 15 * |
429 | 16 * You should have received a copy of the GNU Lesser General Public |
17 * License along with this library; if not, write to the Free Software | |
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
0 | 19 * |
20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
21 */ | |
22 | |
23 #include "../dsputil.h" | |
1092 | 24 #include "../simple_idct.h" |
2067 | 25 #include "../mpegvideo.h" |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
26 #include "mmx.h" |
0 | 27 |
1729 | 28 //#undef NDEBUG |
29 //#include <assert.h> | |
30 | |
1647 | 31 extern const uint8_t ff_h263_loop_filter_strength[32]; |
2868 | 32 extern void ff_idct_xvid_mmx(short *block); |
33 extern void ff_idct_xvid_mmx2(short *block); | |
1647 | 34 |
5 | 35 int mm_flags; /* multimedia extension flags */ |
936 | 36 |
0 | 37 /* pixel operations */ |
1845
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
38 static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL; |
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
39 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL; |
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
40 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL; |
0 | 41 |
1845
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
42 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL; |
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
43 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL; |
2633 | 44 static const uint64_t ff_pw_4 attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL; |
2209 | 45 static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL; |
1845
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
46 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL; |
2209 | 47 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL; |
2754 | 48 static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL; |
1845
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
49 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; |
954 | 50 |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
51 static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL; |
1845
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1784
diff
changeset
|
52 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL; |
1647 | 53 |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
54 #define JUMPALIGN() __asm __volatile (".balign 8"::) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
55 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
56 |
448 | 57 #define MOVQ_WONE(regd) \ |
58 __asm __volatile ( \ | |
59 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
60 "psrlw $15, %%" #regd ::) | |
61 | |
62 #define MOVQ_BFE(regd) \ | |
63 __asm __volatile ( \ | |
64 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ | |
65 "paddb %%" #regd ", %%" #regd " \n\t" ::) | |
66 | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
67 #ifndef PIC |
448 | 68 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone)) |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
69 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo)) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
70 #else |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
71 // for shared library it's better to use this way for accessing constants |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
72 // pcmpeqd -> -1 |
448 | 73 #define MOVQ_BONE(regd) \ |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
74 __asm __volatile ( \ |
448 | 75 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
76 "psrlw $15, %%" #regd " \n\t" \ | |
77 "packuswb %%" #regd ", %%" #regd " \n\t" ::) | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
78 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
79 #define MOVQ_WTWO(regd) \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
80 __asm __volatile ( \ |
448 | 81 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
82 "psrlw $15, %%" #regd " \n\t" \ | |
83 "psllw $1, %%" #regd " \n\t"::) | |
387 | 84 |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
85 #endif |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
86 |
448 | 87 // using regr as temporary and for the output result |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
88 // first argument is unmodifed and second is trashed |
471 | 89 // regfe is supposed to contain 0xfefefefefefefefe |
90 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ | |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
91 "movq " #rega ", " #regr " \n\t"\ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
92 "pand " #regb ", " #regr " \n\t"\ |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
93 "pxor " #rega ", " #regb " \n\t"\ |
471 | 94 "pand " #regfe "," #regb " \n\t"\ |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
95 "psrlq $1, " #regb " \n\t"\ |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
96 "paddb " #regb ", " #regr " \n\t" |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
97 |
471 | 98 #define PAVGB_MMX(rega, regb, regr, regfe) \ |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
99 "movq " #rega ", " #regr " \n\t"\ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
100 "por " #regb ", " #regr " \n\t"\ |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
101 "pxor " #rega ", " #regb " \n\t"\ |
471 | 102 "pand " #regfe "," #regb " \n\t"\ |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
103 "psrlq $1, " #regb " \n\t"\ |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
104 "psubb " #regb ", " #regr " \n\t" |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
105 |
471 | 106 // mm6 is supposed to contain 0xfefefefefefefefe |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
107 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
108 "movq " #rega ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
109 "movq " #regc ", " #regp " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
110 "pand " #regb ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
111 "pand " #regd ", " #regp " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
112 "pxor " #rega ", " #regb " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
113 "pxor " #regc ", " #regd " \n\t"\ |
448 | 114 "pand %%mm6, " #regb " \n\t"\ |
115 "pand %%mm6, " #regd " \n\t"\ | |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
116 "psrlq $1, " #regb " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
117 "psrlq $1, " #regd " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
118 "paddb " #regb ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
119 "paddb " #regd ", " #regp " \n\t" |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
120 |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
121 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
122 "movq " #rega ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
123 "movq " #regc ", " #regp " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
124 "por " #regb ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
125 "por " #regd ", " #regp " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
126 "pxor " #rega ", " #regb " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
127 "pxor " #regc ", " #regd " \n\t"\ |
448 | 128 "pand %%mm6, " #regb " \n\t"\ |
129 "pand %%mm6, " #regd " \n\t"\ | |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
130 "psrlq $1, " #regd " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
131 "psrlq $1, " #regb " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
132 "psubb " #regb ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
133 "psubb " #regd ", " #regp " \n\t" |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
134 |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
135 /***********************************/ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
136 /* MMX no rounding */ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
137 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx |
448 | 138 #define SET_RND MOVQ_WONE |
139 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) | |
471 | 140 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
141 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
142 #include "dsputil_mmx_rnd.h" |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
143 |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
144 #undef DEF |
448 | 145 #undef SET_RND |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
146 #undef PAVGBP |
471 | 147 #undef PAVGB |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
148 /***********************************/ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
149 /* MMX rounding */ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
150 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
151 #define DEF(x, y) x ## _ ## y ##_mmx |
448 | 152 #define SET_RND MOVQ_WTWO |
153 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) | |
471 | 154 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
155 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
156 #include "dsputil_mmx_rnd.h" |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
157 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
158 #undef DEF |
448 | 159 #undef SET_RND |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
160 #undef PAVGBP |
471 | 161 #undef PAVGB |
387 | 162 |
0 | 163 /***********************************/ |
164 /* 3Dnow specific */ | |
165 | |
166 #define DEF(x) x ## _3dnow | |
167 /* for Athlons PAVGUSB is prefered */ | |
168 #define PAVGB "pavgusb" | |
169 | |
170 #include "dsputil_mmx_avg.h" | |
171 | |
172 #undef DEF | |
173 #undef PAVGB | |
174 | |
175 /***********************************/ | |
176 /* MMX2 specific */ | |
177 | |
386 | 178 #define DEF(x) x ## _mmx2 |
0 | 179 |
180 /* Introduced only in MMX2 set */ | |
181 #define PAVGB "pavgb" | |
182 | |
183 #include "dsputil_mmx_avg.h" | |
184 | |
185 #undef DEF | |
186 #undef PAVGB | |
187 | |
188 /***********************************/ | |
189 /* standard MMX */ | |
190 | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
191 #ifdef CONFIG_ENCODERS |
1064 | 192 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) |
0 | 193 { |
386 | 194 asm volatile( |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
195 "mov $-128, %%"REG_a" \n\t" |
386 | 196 "pxor %%mm7, %%mm7 \n\t" |
197 ".balign 16 \n\t" | |
198 "1: \n\t" | |
199 "movq (%0), %%mm0 \n\t" | |
200 "movq (%0, %2), %%mm2 \n\t" | |
201 "movq %%mm0, %%mm1 \n\t" | |
202 "movq %%mm2, %%mm3 \n\t" | |
203 "punpcklbw %%mm7, %%mm0 \n\t" | |
204 "punpckhbw %%mm7, %%mm1 \n\t" | |
205 "punpcklbw %%mm7, %%mm2 \n\t" | |
206 "punpckhbw %%mm7, %%mm3 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
207 "movq %%mm0, (%1, %%"REG_a")\n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
208 "movq %%mm1, 8(%1, %%"REG_a")\n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
209 "movq %%mm2, 16(%1, %%"REG_a")\n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
210 "movq %%mm3, 24(%1, %%"REG_a")\n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
211 "add %3, %0 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
212 "add $32, %%"REG_a" \n\t" |
386 | 213 "js 1b \n\t" |
214 : "+r" (pixels) | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
215 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
216 : "%"REG_a |
386 | 217 ); |
0 | 218 } |
219 | |
1064 | 220 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) |
324 | 221 { |
222 asm volatile( | |
386 | 223 "pxor %%mm7, %%mm7 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
224 "mov $-128, %%"REG_a" \n\t" |
324 | 225 ".balign 16 \n\t" |
226 "1: \n\t" | |
227 "movq (%0), %%mm0 \n\t" | |
228 "movq (%1), %%mm2 \n\t" | |
229 "movq %%mm0, %%mm1 \n\t" | |
230 "movq %%mm2, %%mm3 \n\t" | |
231 "punpcklbw %%mm7, %%mm0 \n\t" | |
232 "punpckhbw %%mm7, %%mm1 \n\t" | |
233 "punpcklbw %%mm7, %%mm2 \n\t" | |
234 "punpckhbw %%mm7, %%mm3 \n\t" | |
235 "psubw %%mm2, %%mm0 \n\t" | |
236 "psubw %%mm3, %%mm1 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
237 "movq %%mm0, (%2, %%"REG_a")\n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
238 "movq %%mm1, 8(%2, %%"REG_a")\n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
239 "add %3, %0 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
240 "add %3, %1 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
241 "add $16, %%"REG_a" \n\t" |
324 | 242 "jnz 1b \n\t" |
243 : "+r" (s1), "+r" (s2) | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
244 : "r" (block+64), "r" ((long)stride) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
245 : "%"REG_a |
324 | 246 ); |
247 } | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
248 #endif //CONFIG_ENCODERS |
324 | 249 |
1064 | 250 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
0 | 251 { |
252 const DCTELEM *p; | |
1064 | 253 uint8_t *pix; |
0 | 254 |
255 /* read the pixels */ | |
256 p = block; | |
257 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
258 /* unrolled loop */ |
0 | 259 __asm __volatile( |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
260 "movq %3, %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
261 "movq 8%3, %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
262 "movq 16%3, %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
263 "movq 24%3, %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
264 "movq 32%3, %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
265 "movq 40%3, %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
266 "movq 48%3, %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
267 "movq 56%3, %%mm7\n\t" |
0 | 268 "packuswb %%mm1, %%mm0\n\t" |
269 "packuswb %%mm3, %%mm2\n\t" | |
270 "packuswb %%mm5, %%mm4\n\t" | |
271 "packuswb %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
272 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
273 "movq %%mm2, (%0, %1)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
274 "movq %%mm4, (%0, %1, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
275 "movq %%mm6, (%0, %2)\n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
276 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p) |
0 | 277 :"memory"); |
278 pix += line_size*4; | |
279 p += 32; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
280 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
281 // if here would be an exact copy of the code above |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
282 // compiler would generate some very strange code |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
283 // thus using "r" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
284 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
285 "movq (%3), %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
286 "movq 8(%3), %%mm1\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
287 "movq 16(%3), %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
288 "movq 24(%3), %%mm3\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
289 "movq 32(%3), %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
290 "movq 40(%3), %%mm5\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
291 "movq 48(%3), %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
292 "movq 56(%3), %%mm7\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
293 "packuswb %%mm1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
294 "packuswb %%mm3, %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
295 "packuswb %%mm5, %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
296 "packuswb %%mm7, %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
297 "movq %%mm0, (%0)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
298 "movq %%mm2, (%0, %1)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
299 "movq %%mm4, (%0, %1, 2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
300 "movq %%mm6, (%0, %2)\n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
301 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p) |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
302 :"memory"); |
0 | 303 } |
304 | |
2753 | 305 static const unsigned char __align8 vector128[8] = |
1985
b2bc62fdecc0
move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents:
1984
diff
changeset
|
306 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; |
b2bc62fdecc0
move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents:
1984
diff
changeset
|
307 |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
308 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
309 { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
310 int i; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
311 |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
312 movq_m2r(*vector128, mm1); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
313 for (i = 0; i < 8; i++) { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
314 movq_m2r(*(block), mm0); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
315 packsswb_m2r(*(block + 4), mm0); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
316 block += 8; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
317 paddb_r2r(mm1, mm0); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
318 movq_r2m(mm0, *pixels); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
319 pixels += line_size; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
320 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
321 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
322 |
/**
 * Add an 8x8 block of 16-bit DCT coefficients to an 8x8 block of pixels,
 * clamping the result to 0..255.
 * Uses paddsw (signed saturating add on 16-bit words) followed by packuswb
 * (pack with unsigned saturation), which implements the clamp.
 * Two rows (2*8 coefficients) are handled per asm statement; the do/while
 * runs 4 times, covering all 8 rows.
 *
 * @param block     source coefficients (8x8 = 64 DCTELEMs, row-major)
 * @param pixels    destination pixel rows
 * @param line_size byte stride between destination rows
 */
void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* mm7 = 0, used below to zero-extend bytes to words via punpck{l,h}bw
       (MOVQ_ZERO is a macro from mmx.h — presumably a pxor; verify there) */
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
                "movq   (%2), %%mm0\n\t"        /* row 0, coeffs 0-3 */
                "movq   8(%2), %%mm1\n\t"       /* row 0, coeffs 4-7 */
                "movq   16(%2), %%mm2\n\t"      /* row 1, coeffs 0-3 */
                "movq   24(%2), %%mm3\n\t"      /* row 1, coeffs 4-7 */
                "movq   %0, %%mm4\n\t"          /* row 0 pixels */
                "movq   %1, %%mm6\n\t"          /* row 1 pixels */
                "movq   %%mm4, %%mm5\n\t"
                "punpcklbw %%mm7, %%mm4\n\t"    /* widen row-0 pixels 0-3 to words */
                "punpckhbw %%mm7, %%mm5\n\t"    /* widen row-0 pixels 4-7 */
                "paddsw %%mm4, %%mm0\n\t"       /* pixel + coeff, signed saturation */
                "paddsw %%mm5, %%mm1\n\t"
                "movq   %%mm6, %%mm5\n\t"
                "punpcklbw %%mm7, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm5\n\t"
                "paddsw %%mm6, %%mm2\n\t"
                "paddsw %%mm5, %%mm3\n\t"
                "packuswb %%mm1, %%mm0\n\t"     /* clamp to 0..255, repack to bytes */
                "packuswb %%mm3, %%mm2\n\t"
                "movq   %%mm0, %0\n\t"
                "movq   %%mm2, %1\n\t"
                :"+m"(*pix), "+m"(*(pix+line_size))
                :"r"(p)
                :"memory");
        pix += line_size*2;     /* advance two pixel rows per iteration */
        p += 16;                /* advance two coefficient rows (2*8 DCTELEMs) */
    } while (--i);
}
363 | |
/**
 * Copy a 4-pixel-wide block of height h from pixels to block.
 * Four rows are copied per loop iteration ("subl $4, %0"), so h is
 * expected to be a multiple of 4. REG_a abstracts eax/rax so the same
 * code assembles on x86-32 and x86-64.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size byte stride for both source and destination rows
 * @param h         number of rows (multiple of 4)
 */
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"   /* REG_a = 2*line_size */
         ".balign 8                     \n\t"
         "1:                            \n\t"
         "movd (%1), %%mm0              \n\t"   /* rows n and n+1 */
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movd (%1), %%mm0              \n\t"   /* rows n+2 and n+3 */
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"   /* 4 rows done per iteration */
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}
389 | |
/**
 * Copy an 8-pixel-wide block of height h from pixels to block,
 * one movq (8 bytes) per row. Four rows per loop iteration, so h is
 * expected to be a multiple of 4.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size byte stride for both source and destination rows
 * @param h         number of rows (multiple of 4)
 */
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"   /* REG_a = 2*line_size */
         ".balign 8                     \n\t"
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"   /* rows n and n+1 */
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"   /* rows n+2 and n+3 */
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"   /* 4 rows done per iteration */
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}
415 | |
/**
 * Copy a 16-pixel-wide block of height h from pixels to block,
 * two movq (2x8 bytes) per row. Four rows per loop iteration, so h is
 * expected to be a multiple of 4.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size byte stride for both source and destination rows
 * @param h         number of rows (multiple of 4)
 */
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"   /* REG_a = 2*line_size */
         ".balign 8                     \n\t"
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"   /* row n, bytes 0-7 and 8-15 */
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"   /* row n+1 */
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"   /* rows n+2 and n+3 */
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"   /* 4 rows done per iteration */
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}
449 | |
/**
 * Zero six 64-coefficient DCT blocks (6 * 64 * sizeof(DCTELEM) = 6*128 bytes).
 * The base pointer is biased to the END of the region and REG_a counts up
 * from -128*6 to 0, so the loop condition is a simple sign test ("js 1b").
 * Each iteration clears 32 bytes with four movq stores.
 *
 * @param blocks  array of 6 DCT blocks to clear
 */
static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
                "pxor %%mm7, %%mm7              \n\t"   /* mm7 = 0 */
                "mov $-128*6, %%"REG_a"         \n\t"   /* negative offset from end */
                "1:                             \n\t"
                "movq %%mm7, (%0, %%"REG_a")    \n\t"
                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
                "add $32, %%"REG_a"             \n\t"
                " js 1b                         \n\t"   /* loop while offset < 0 */
                : : "r" (((uint8_t *)blocks)+128*6)
                : "%"REG_a
        );
}
466 | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
467 #ifdef CONFIG_ENCODERS |
/**
 * Sum all pixels of a 16x16 block.
 * The row pointer is biased by +line_size*16 and index runs from
 * -line_size*16 up to 0, so "js 1b" terminates the loop.
 * Each 8-byte load is duplicated into two registers and the low/high
 * halves are zero-extended separately (mm7 = 0), accumulating 16-bit
 * word sums in mm6; the horizontal reduction at the end folds mm6 into
 * a single word (masked with 0xFFFF).
 * Note: 16-bit lanes are sufficient since 16*16*255 = 65280 < 65536.
 *
 * @param pix       top-left of the 16x16 block
 * @param line_size byte stride between rows
 * @return sum of the 256 pixel values
 */
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm __volatile(
                "pxor %%mm7, %%mm7              \n\t"   /* zero, for unpacking */
                "pxor %%mm6, %%mm6              \n\t"   /* accumulator */
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t"   /* bytes 0-7, two copies */
                "movq (%2, %1), %%mm1           \n\t"
                "movq 8(%2, %1), %%mm2          \n\t"   /* bytes 8-15, two copies */
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"   /* widen to 16-bit words */
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t"   /* next row; index -> 0 */
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t"   /* horizontal add of 4 words */
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t"   /* keep the low word only */
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((long)line_size)
        );

        return sum;
}
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
505 #endif //CONFIG_ENCODERS |
688 | 506 |
/**
 * dst[i] += src[i] for i in [0, w), byte-wise with wraparound (paddb,
 * non-saturating). The MMX loop handles 16 bytes per iteration up to
 * w-15; the scalar tail loop finishes the remaining 0..15 bytes.
 *
 * @param dst destination/accumulator buffer
 * @param src bytes to add
 * @param w   number of bytes
 */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"   /* byte add, wraps mod 256 */
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %3, %0                     \n\t"   /* stop before overrunning w */
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    /* scalar tail for the last w%16 (up to 15) bytes */
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
528 | |
/* Core of the H.263 deblocking loop filter, shared by the vertical and
 * horizontal filters below. Asm-fragment macro; the enclosing asm statement
 * must bind its operands as:
 *   %0..%3 = the four 8-pixel lines straddling the block edge
 *            (edge lies between %1 and %2; see the call sites)
 *   %4     = 2*strength (filter strength from ff_h263_loop_filter_strength)
 *   %5     = ff_pb_FC constant (mask used after the psrlw $2 correction)
 * On exit the filtered lines are left in registers:
 *   mm5 -> %0, mm3 -> %1, mm4 -> %2, mm6 -> %3
 * and the caller stores them back (possibly after a transpose).
 * All eight mm registers are clobbered. */
#define H263_LOOP_FILTER \
        "pxor %%mm7, %%mm7              \n\t"\
        "movq  %0, %%mm0                \n\t"\
        "movq  %0, %%mm1                \n\t"\
        "movq  %3, %%mm2                \n\t"\
        "movq  %3, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "psubw %%mm2, %%mm0             \n\t"\
        "psubw %%mm3, %%mm1             \n\t"\
        "movq  %1, %%mm2                \n\t"\
        "movq  %1, %%mm3                \n\t"\
        "movq  %2, %%mm4                \n\t"\
        "movq  %2, %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "punpcklbw %%mm7, %%mm4         \n\t"\
        "punpckhbw %%mm7, %%mm5         \n\t"\
        "psubw %%mm2, %%mm4             \n\t"\
        "psubw %%mm3, %%mm5             \n\t"\
        "psllw $2, %%mm4                \n\t"\
        "psllw $2, %%mm5                \n\t"\
        "paddw %%mm0, %%mm4             \n\t"\
        "paddw %%mm1, %%mm5             \n\t"\
        "pxor %%mm6, %%mm6              \n\t"\
        "pcmpgtw %%mm4, %%mm6           \n\t"\
        "pcmpgtw %%mm5, %%mm7           \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "pxor %%mm7, %%mm5              \n\t"\
        "psubw %%mm6, %%mm4             \n\t"\
        "psubw %%mm7, %%mm5             \n\t"\
        "psrlw $3, %%mm4                \n\t"\
        "psrlw $3, %%mm5                \n\t"\
        "packuswb %%mm5, %%mm4          \n\t"\
        "packsswb %%mm7, %%mm6          \n\t"\
        "pxor %%mm7, %%mm7              \n\t"\
        "movd %4, %%mm2                 \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "psubusb %%mm4, %%mm2           \n\t"\
        "movq %%mm2, %%mm3              \n\t"\
        "psubusb %%mm4, %%mm3           \n\t"\
        "psubb %%mm3, %%mm2             \n\t"\
        "movq %1, %%mm3                 \n\t"\
        "movq %2, %%mm4                 \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm3           \n\t"\
        "psubusb %%mm2, %%mm4           \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm2           \n\t"\
        "packsswb %%mm1, %%mm0          \n\t"\
        "pcmpgtb %%mm0, %%mm7           \n\t"\
        "pxor %%mm7, %%mm0              \n\t"\
        "psubb %%mm7, %%mm0             \n\t"\
        "movq %%mm0, %%mm1              \n\t"\
        "psubusb %%mm2, %%mm0           \n\t"\
        "psubb %%mm0, %%mm1             \n\t"\
        "pand %5, %%mm1                 \n\t"\
        "psrlw $2, %%mm1                \n\t"\
        "pxor %%mm7, %%mm1              \n\t"\
        "psubb %%mm7, %%mm1             \n\t"\
        "movq %0, %%mm5                 \n\t"\
        "movq %3, %%mm6                 \n\t"\
        "psubb %%mm1, %%mm5             \n\t"\
        "paddb %%mm1, %%mm6             \n\t"
599 | |
/**
 * H.263 deblocking filter across a horizontal block edge: filters the
 * four 8-pixel rows at src-2*stride .. src+1*stride (edge between
 * src-stride and src). The actual filtering is done by H263_LOOP_FILTER,
 * which leaves the results in mm5/mm3/mm4/mm6; they are stored straight
 * back to the four rows here.
 *
 * @param src    pointer to the first row BELOW the edge
 * @param stride byte stride between rows
 * @param qscale quantizer, indexes ff_h263_loop_filter_strength[]
 */
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        /* write back the filtered lines left in registers by the macro */
        "movq %%mm3, %1                 \n\t"
        "movq %%mm4, %2                 \n\t"
        "movq %%mm5, %0                 \n\t"
        "movq %%mm6, %3                 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
}
618 | |
/**
 * Transpose a 4x4 block of bytes: dst[x][y] = src[y][x].
 * Loads four 4-byte rows, interleaves them with punpcklbw/punpcklwd,
 * and stores the resulting columns as rows of dst.
 *
 * @param dst        destination (4 rows of 4 bytes)
 * @param src        source (4 rows of 4 bytes)
 * @param dst_stride byte stride of dst rows
 * @param src_stride byte stride of src rows
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0                \n\t"   /* src row 0 */
        "movd  %5, %%mm1                \n\t"   /* src row 1 */
        "movd  %6, %%mm2                \n\t"   /* src row 2 */
        "movd  %7, %%mm3                \n\t"   /* src row 3 */
        "punpcklbw %%mm1, %%mm0         \n\t"   /* interleave rows 0/1 */
        "punpcklbw %%mm3, %%mm2         \n\t"   /* interleave rows 2/3 */
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"   /* columns 0 and 1 */
        "punpckhwd %%mm2, %%mm1         \n\t"   /* columns 2 and 3 */
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}
647 | |
/**
 * H.263 deblocking filter across a VERTICAL block edge (the 4 columns at
 * src-2 .. src+1). Strategy: transpose the 8x4 pixel strip into a
 * temporary buffer so the columns become rows, run the same
 * H263_LOOP_FILTER as the vertical case, then transpose the results
 * (left in mm3/mm4/mm5/mm6 by the macro) back to columns with register
 * shuffles in a second asm statement.
 * NOTE(review): the two asm statements depend on the mm registers
 * surviving between them — do not insert code between them.
 *
 * @param src    pointer to the first column RIGHT of the edge
 * @param stride byte stride between rows
 * @param qscale quantizer, indexes ff_h263_loop_filter_strength[]
 */
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    const int strength= ff_h263_loop_filter_strength[qscale];
    uint64_t temp[4] __attribute__ ((aligned(8)));
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;   /* move to the leftmost of the 4 columns being filtered */

    /* gather the 4x8 column strip into temp as 8x4 rows */
    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    asm volatile(
        H263_LOOP_FILTER // 5  3  4  6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    /* transpose mm5/mm3/mm4/mm6 back and scatter as 4-byte columns */
    asm volatile(
        "movq %%mm5, %%mm1              \n\t"
        "movq %%mm4, %%mm0              \n\t"
        "punpcklbw %%mm3, %%mm5         \n\t"
        "punpcklbw %%mm6, %%mm4         \n\t"
        "punpckhbw %%mm3, %%mm1         \n\t"
        "punpckhbw %%mm6, %%mm0         \n\t"
        "movq %%mm5, %%mm3              \n\t"
        "movq %%mm1, %%mm6              \n\t"
        "punpcklwd %%mm4, %%mm5         \n\t"
        "punpcklwd %%mm0, %%mm1         \n\t"
        "punpckhwd %%mm4, %%mm3         \n\t"
        "punpckhwd %%mm0, %%mm6         \n\t"
        "movd %%mm5, (%0)               \n\t"   /* rows 0-3 at src */
        "punpckhdq %%mm5, %%mm5         \n\t"
        "movd %%mm5, (%0,%2)            \n\t"
        "movd %%mm3, (%0,%2,2)          \n\t"
        "punpckhdq %%mm3, %%mm3         \n\t"
        "movd %%mm3, (%0,%3)            \n\t"
        "movd %%mm1, (%1)               \n\t"   /* rows 4-7 at src+4*stride */
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd %%mm1, (%1,%2)            \n\t"
        "movd %%mm6, (%1,%2,2)          \n\t"
        "punpckhdq %%mm6, %%mm6         \n\t"
        "movd %%mm6, (%1,%3)            \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((long)   stride ),
           "r" ((long)(3*stride))
    );
}
698 | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
699 #ifdef CONFIG_ENCODERS |
/**
 * Sum of squared pixel values of a 16x16 block (the "norm1" used by the
 * encoder's rate/distortion code). 16 bytes per row, 16 rows (ecx
 * counts down from 16). Squares are formed pairwise with pmaddwd and
 * accumulated as 32-bit dwords in mm7; the final psrlq/paddd folds the
 * two dword lanes into one result.
 *
 * @param pix       top-left of the 16x16 block
 * @param line_size byte stride between rows
 * @return sum over all 256 pixels of pix[i]^2
 */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"        /* 16 rows */
      "pxor %%mm0,%%mm0\n"      /* zero, for unpacking bytes to words */
      "pxor %%mm7,%%mm7\n"      /* dword accumulator */
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"            /* next row */
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
    return tmp;
}
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
984
diff
changeset
|
742 |
/**
 * Sum of squared errors between two 8-pixel-wide blocks of height h.
 * Per row: |pix1 - pix2| is computed branchlessly with two psubusb and
 * a por (unsigned saturating subtract in each direction, then OR — one
 * of the two is zero), the absolute differences are widened to words
 * and squared/summed pairwise with pmaddwd, accumulating dwords in mm7.
 *
 * @param v         unused context pointer (me_cmp_func signature)
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte stride for both blocks
 * @param h         number of rows
 * @return sum over the block of (pix1[i]-pix2[i])^2
 */
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"         /* row counter = h */
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */

      /* abs diff: saturating subtract both ways, OR the results */
      "movq %%mm1,%%mm5\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm5,%%mm2\n"

      "por %%mm1,%%mm2\n"

      "movq %%mm2,%%mm1\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */

      "pmaddwd %%mm2,%%mm2\n"   /* square and pair-sum the diffs */
      "pmaddwd %%mm1,%%mm1\n"

      "add %3,%0\n"             /* next row in both blocks */
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm1,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
785 | |
/**
 * Sum of squared errors of a 16 pixel wide block, plain-MMX version.
 * Returns sum over h rows of (pix1[x]-pix2[x])^2 for x = 0..15.
 * @param v          unused context pointer (me_cmp function-pointer signature)
 * @param line_size  stride of both pix1 and pix2
 * @param h          number of rows; used as the loop counter in ecx
 * Clobbers the MMX state; as elsewhere in this file, emms is the caller's
 * responsibility.
 */
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"          /* ecx = rows left = h */
      "pxor %%mm0,%%mm0\n"       /* mm0 = 0 (zero for byte->word unpack) */
      "pxor %%mm7,%%mm7\n"       /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"        /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"        /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"       /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"       /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"        /* mm2 = |pix1-pix2| bytes 0-7 */
      "por %%mm3,%%mm4\n"        /* mm4 = |pix1-pix2| bytes 8-15 */

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n"  /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n"  /* mm4 now spread over (mm3,mm4) */

      /* pmaddwd of a register with itself = squares of the word
         differences, pairwise-summed into dwords */
      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"              /* pix1 += line_size */
      "add %3,%1\n"              /* pix2 += line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"       /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"      /* fold the two dword partial sums */
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
984
diff
changeset
|
845 |
/**
 * High-frequency "noise" measure of an 8 pixel wide block, MMX version.
 * For each row it builds the horizontal first differences of adjacent
 * bytes (via the psllq/psrlq byte-shift trick) as 16-bit words, then
 * accumulates the absolute value of the vertical change of those
 * differences between consecutive rows into mm6.
 * The two rows before the "1:" label are the loop prologue; the loop
 * counter is h-2 and is decremented by 2 per iteration, so h is assumed
 * even (and > 2) — TODO confirm against callers.
 * Clobbers MMX state (no emms here, as elsewhere in this file).
 */
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"           /* mm7 = 0, for unpacking */
      "pxor %%mm6,%%mm6\n"           /* mm6 = word accumulator */

      /* prologue row 0: mm0/mm2 = horizontal diffs (lo/hi half) */
      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"            /* mm0 = row shifted by one byte */
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"         /* horizontal diffs, words 0-3 */
      "psubw %%mm3, %%mm2\n"         /* horizontal diffs, words 4-7 */

      "add %2,%0\n"                  /* pix1 += line_size */

      /* prologue row 1: diffs in mm4/mm5; |row0 - row1| added to mm6 */
      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"         /* vertical change of the h-diffs */
      "psubw %%mm5, %%mm2\n"
      /* abs via sign mask: x = (x ^ (x>>15)) - (x>>15) */
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      /* even row of the pair: diffs in mm0/mm2, compare with mm4/mm5 */
      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      /* odd row of the pair: diffs in mm4/mm5, compare with mm0/mm2 */
      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      /* widen the word accumulator to dwords and fold to a scalar */
      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
    return tmp;
}
970 | |
/**
 * High-frequency "noise" measure of a 16 pixel wide block, MMX version.
 * Same idea as hf_noise8_mmx, but the horizontal neighbour is fetched
 * with an unaligned load from 1(%0) instead of byte-shifting, covering
 * bytes 0-8 of the row; the remaining columns are handled by the
 * hf_noise8_mmx(pix+8, ...) call at the end (pix keeps the original
 * pointer, since pix1 is advanced by the asm via the "+r" constraint).
 * Loop counter is h-2, decremented by 2 — h assumed even (and > 2),
 * TODO confirm against callers.  Clobbers MMX state (no emms here).
 */
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;               /* saved: pix1 is modified below */
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"             /* mm7 = 0, for unpacking */
      "pxor %%mm6,%%mm6\n"             /* mm6 = word accumulator */

      /* prologue row 0: horizontal diffs pix[x]-pix[x+1] in mm0/mm2 */
      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"                    /* pix1 += line_size */

      /* prologue row 1: diffs in mm4/mm5; |row0 - row1| added to mm6 */
      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      /* abs via sign mask: x = (x ^ (x>>15)) - (x>>15) */
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      /* even row of the pair: diffs in mm0/mm2, compare with mm4/mm5 */
      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      /* odd row of the pair: diffs in mm4/mm5, compare with mm0/mm2 */
      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      /* widen the word accumulator to dwords and fold to a scalar */
      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      : "%ecx");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
1084 | |
2864
95bac7109ff0
Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents:
2754
diff
changeset
|
1085 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { |
95bac7109ff0
Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents:
2754
diff
changeset
|
1086 MpegEncContext *c = p; |
2067 | 1087 int score1= sse16_mmx(c, pix1, pix2, line_size, h); |
1088 int score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); | |
1089 | |
1090 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight; | |
1091 else return score1 + ABS(score2)*8; | |
1092 } | |
1093 | |
2864
95bac7109ff0
Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents:
2754
diff
changeset
|
1094 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { |
95bac7109ff0
Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents:
2754
diff
changeset
|
1095 MpegEncContext *c = p; |
2067 | 1096 int score1= sse8_mmx(c, pix1, pix2, line_size, h); |
1097 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); | |
1098 | |
1099 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight; | |
1100 else return score1 + ABS(score2)*8; | |
1101 } | |
1102 | |
/**
 * Sum of absolute vertical differences of a 16 pixel wide block
 * (intra variant: each row is compared against the previous row of the
 * same plane).  Plain-MMX version: absolute byte differences are built
 * with the psubusb/psubusb/por trick and accumulated as 16-bit words in
 * mm6, hence only the low word of the final fold is valid ("& 0xFFFF").
 * h is assumed even — the counter is decremented by 2 per iteration.
 * NOTE(review): the alignment assert casts the pointer through (int),
 * which truncates on 64-bit; only the low 3 bits matter here though.
 */
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

/* SUM(in0,in1,out0,out1): load the current row into out0/out1, advance
   pix, build |current row - previous row| (previous row in in0/in1),
   widen the bytes to words and add them into the mm6 accumulator. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"             /* mm6 = word accumulator */
      "pxor %%mm7,%%mm7\n"             /* mm7 = 0, for unpacking */
      "movq (%0),%%mm0\n"              /* prologue: first row */
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      /* two rows per iteration, ping-ponging the "previous row" regs */
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      /* fold the four word partial sums; only the low word is valid */
      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM
1164 | |
/**
 * Sum of absolute vertical differences of a 16 pixel wide block
 * (intra variant), MMX2 version: uses psadbw, which produces the byte
 * SAD of a whole qword directly, so the result needs no masking.
 * h is assumed even — the counter is decremented by 2 per iteration.
 * NOTE(review): the alignment assert casts the pointer through (int),
 * which truncates on 64-bit; only the low 3 bits matter here though.
 */
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

/* SUM(in0,in1,out0,out1): load the current row into out0/out1, advance
   pix, psadbw it against the previous row (in0/in1) and accumulate the
   SADs into mm6. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"             /* mm6 = accumulator */
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"              /* prologue: first row */
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      /* two rows per iteration, ping-ponging the "previous row" regs */
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM
1205 | |
/**
 * Sum of absolute vertical differences of the residual (pix1 - pix2)
 * of a 16 pixel wide block: sums |d[y][x] - d[y-1][x]| where
 * d = pix1 - pix2.  Plain-MMX version.
 * mm7 is filled with 0x80 bytes (pcmpeqw -> psllw 15 -> packsswb) and
 * xored into the signed byte differences, biasing them to unsigned so
 * the psubusb/por absolute-difference trick works.  The accumulator
 * holds 16-bit words, hence the "& 0x7FFF" on the folded result.
 * h is assumed even — the counter is decremented by 2 per iteration.
 * NOTE(review): the alignment asserts cast pointers through (int),
 * which truncates on 64-bit; only the low 3 bits matter here though.
 */
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

/* SUM(in0,in1,out0,out1): compute the biased residual of the current
   row pair into out0/out1, advance both pointers, take the absolute
   difference against the previous residual (in0/in1), widen to words
   and accumulate into mm6. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"             /* mm6 = word accumulator */
      "pcmpeqw %%mm7,%%mm7\n"          /* build mm7 = 0x8080808080808080 */
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"              /* prologue: first residual row */
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"            /* bias signed diff to unsigned */
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      /* two rows per iteration, ping-ponging the "previous" regs */
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      /* fold the four word partial sums; low word only is valid */
      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM
1284 | |
/**
 * Sum of absolute vertical differences of the residual (pix1 - pix2)
 * of a 16 pixel wide block, MMX2 version: the absolute differences and
 * horizontal sum are done in one step with psadbw, so no word masking
 * of the result is needed.  mm7 = 0x80 bytes is xored in to bias the
 * signed byte residuals to unsigned before psadbw.
 * h is assumed even — the counter is decremented by 2 per iteration.
 * NOTE(review): the alignment asserts cast pointers through (int),
 * which truncates on 64-bit; only the low 3 bits matter here though.
 */
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

/* SUM(in0,in1,out0,out1): compute the biased residual of the current
   row into out0/out1, advance both pointers, psadbw it against the
   previous residual (in0/in1) and accumulate into mm6. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"             /* mm6 = accumulator */
      "pcmpeqw %%mm7,%%mm7\n"          /* build mm7 = 0x8080808080808080 */
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"              /* prologue: first residual row */
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"            /* bias signed diff to unsigned */
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      /* two rows per iteration, ping-ponging the "previous" regs */
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM
1342 | |
/**
 * dst[i] = src1[i] - src2[i] (per byte, modulo 256) for 0 <= i < w.
 * The MMX loop handles 16 bytes per iteration; the scalar tail covers
 * the remaining w%16 bytes, and everything when w < 16.
 * dst may be wider than w; bytes at and beyond dst[w] are not written
 * (except within the last 16-byte MMX chunk, as before).
 */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    /* The asm loop is do-while shaped: it always processes one 16 byte
     * chunk before comparing i against (long)w-15 with an unsigned jb.
     * For w < 16 that comparison wraps and the loop over-runs the
     * buffers, so only enter it when a full 16 byte chunk exists. */
    if(w >= 16){
        __asm__ volatile(
            "1:                             \n\t"
            "movq (%2, %0), %%mm0           \n\t"
            "movq (%1, %0), %%mm1           \n\t"
            "psubb %%mm0, %%mm1             \n\t"
            "movq %%mm1, (%3, %0)           \n\t"
            "movq 8(%2, %0), %%mm0          \n\t"
            "movq 8(%1, %0), %%mm1          \n\t"
            "psubb %%mm0, %%mm1             \n\t"
            "movq %%mm1, 8(%3, %0)          \n\t"
            "add $16, %0                    \n\t"
            "cmp %4, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (i)
            : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
            /* the loop stores through dst, which is only an input
             * operand, so the compiler must be told memory changed */
            : "memory"
        );
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
/**
 * HuffYUV-style median-prediction residual (MMX2).
 * For each i: pred = median(L, T, L+T-LT) with T = src1[i] (top row),
 * LT = src1[i-1] (top-left), L = src2[i-1] (left); stores
 * dst[i] = src2[i] - pred.  The branch-free median uses
 * max(min(L,T), min(max(L,T), L+T-LT)) via pminub/pmaxub.
 * dst[0] is recomputed in C afterwards using *left / *left_top as the
 * carried-over context, and both are updated for the caller's next row.
 * NOTE(review): the asm advances 8 bytes per iteration while i < w, so up to
 * 7 bytes past w may be read/written when w is not a multiple of 8 —
 * presumably callers pad; confirm.
 */
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t" // median = prediction
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
    );

    l= *left;
    lt= *left_top;

    // first pixel uses the carried-over left/left-top context, not the
    // (meaningless) bytes the asm read at offset -1
    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left = src2[w-1];
}
1400 | |
/* Two parallel butterflies: (a,b) -> (a+b, b-a) on packed 16-bit words. */
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

/* 3 butterfly stages over mm0..mm7 = one 8-point Hadamard transform
 * (4 transforms at once, one per 16-bit lane). */
#define HADAMARD48\
        LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
        LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
        LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
        LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
        LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
        LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\

/* a = |a| on packed signed words, plain MMX (sign mask via pcmpgtw);
 * clobbers z. */
#define MMABS(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

/* sum += |a| with unsigned saturation (paddusw); plain MMX version. */
#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"\
    "paddusw " #a ", " #sum "         \n\t"

/* a = |a| using MMX2 pmaxsw: max(a, -a); clobbers z. */
#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

/* sum += |a|, MMX2 pmaxsw variant. */
#define MMABS_SUM_MMX2(a,z, sum)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"\
    "paddusw " #a ", " #sum "         \n\t"

/* Interleave low/high halves of a and b into a and t (n = wd or dq). */
#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "              \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "     \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "     \n\t" /* cgdh */\

/* Transpose a 4x4 block of 16-bit words held in a,b,c,d (t = scratch);
 * result rows end up in a, d, t, c (see per-stage comments). */
#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

/* Load 4 rows of a DCTELEM block at byte offset o (rows are 16 bytes apart). */
#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1), " #a "            \n\t"\
    "movq "#o"+16(%1), " #b "         \n\t"\
    "movq "#o"+32(%1), " #c "         \n\t"\
    "movq "#o"+48(%1), " #d "         \n\t"

/* Store 4 rows back to the same layout as LOAD4. */
#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)              \n\t"\
    "movq "#b", "#o"+16(%1)           \n\t"\
    "movq "#c", "#o"+32(%1)           \n\t"\
    "movq "#d", "#o"+48(%1)           \n\t"\
/**
 * 8x8 Hadamard-transformed SAD (SATD) of src1 - src2, plain MMX.
 * Computes the pixel difference block, applies a horizontal then (after a
 * 4x4-wise transpose) vertical 8-point Hadamard transform, and sums the
 * absolute values of all coefficients with saturating adds.
 * h must be 8; the 16-pixel-high variant is built by WARPER8_16_SQ below.
 * Returns the low 16 bits of the saturated sum.
 */
static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
    uint64_t temp[16] __align8;  // 8x8 DCTELEM scratch + spill area
    int sum=0;

    assert(h==8);

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        /* first pass: transform + transpose left and right 8x4 halves */
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)            \n\t" // spill mm7, needed as scratch

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7            \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)            \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7            \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5              \n\t"//FIXME remove
        "movq %%mm6, %%mm7              \n\t"
        "movq %%mm0, %%mm6              \n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        /* second pass: transform transposed halves and accumulate |coeff| */
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)             \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1             \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)             \n\t" // partial sum of second half

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)               \n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1               \n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1             \n\t" // add partial sum of 2nd half
        MMABS_SUM(%%mm1, %%mm7, %%mm0)

        /* horizontal reduction of the 4 word lanes of mm0 */
        "movq %%mm0, %%mm1              \n\t"
        "psrlq $32, %%mm0               \n\t"
        "paddusw %%mm1, %%mm0           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "psrlq $16, %%mm0               \n\t"
        "paddusw %%mm1, %%mm0           \n\t"
        "movd %%mm0, %0                 \n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;  // only the low word of the movd result is the sum
}
1550 | |
/**
 * 8x8 Hadamard-transformed SAD (SATD) of src1 - src2, MMX2 version.
 * Identical structure to hadamard8_diff_mmx but uses the pmaxsw-based
 * absolute value macros and pshufw for the final lane reduction
 * (replacing the mov + psrlq pairs of the plain-MMX version).
 * h must be 8; returns the low 16 bits of the saturated sum.
 */
static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
    uint64_t temp[16] __align8;  // 8x8 DCTELEM scratch + spill area
    int sum=0;

    assert(h==8);

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        /* first pass: transform + transpose left and right 8x4 halves */
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)            \n\t" // spill mm7, needed as scratch

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7            \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)            \n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7            \n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5              \n\t"//FIXME remove
        "movq %%mm6, %%mm7              \n\t"
        "movq %%mm0, %%mm6              \n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        /* second pass: transform transposed halves and accumulate |coeff| */
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)             \n\t"
        MMABS_MMX2(%%mm0, %%mm7)
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1             \n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)             \n\t" // partial sum of second half

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)               \n\t"
        MMABS_MMX2(%%mm0, %%mm7)
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1               \n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1             \n\t" // add partial sum of 2nd half
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)

        /* horizontal reduction via pshufw instead of mov + psrlq */
        "pshufw $0x0E, %%mm0, %%mm1     \n\t"
        "paddusw %%mm1, %%mm0           \n\t"
        "pshufw $0x01, %%mm0, %%mm1     \n\t"
        "paddusw %%mm1, %%mm0           \n\t"
        "movd %%mm0, %0                 \n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;  // only the low word of the movd result is the sum
}
1635 | |
1636 | |
/* Build the 16-pixel-high SATD variants from the 8x8 kernels above
 * (WARPER8_16_SQ is a wrapper macro defined elsewhere — presumably it sums
 * the four 8x8 quadrants; confirm against dsputil). */
WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
#endif //CONFIG_ENCODERS

/* A plain copy involves no rounding, so the "no rounding" put functions are
 * simple aliases of the normal put functions. */
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
1643 | |
/* One packed output row of the vertical qpel lowpass filter:
 * out = (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5, packed with unsigned
 * saturation (packuswb), where x1..x4 are sums of symmetric tap pairs.
 * m3..m6 hold preloaded rows, in0/in1/in2/in7 are rows read from memory,
 * and OP merges the result into 'out'.  Clobbers mm4-mm6; expects mm7 == 0.
 * NOTE(review): the pw_20/pw_3 parameters are unused here — the constants
 * are referenced via MANGLE() directly. */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
    "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
    "movq "#in7", " #m3 "             \n\t" /* d */\
    "movq "#in0", %%mm5               \n\t" /* D */\
    "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
    "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
    "movq "#in1", %%mm5               \n\t" /* C */\
    "movq "#in2", %%mm6               \n\t" /* B */\
    "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
    "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
    "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
    "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
    "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
    "paddw " #rnd ", %%mm4            \n\t" /* x2 */\
    "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw $5, %%mm5                  \n\t"\
    "packuswb %%mm5, %%mm5            \n\t"\
    OP(%%mm5, out, %%mm7, d)
1664 | |
/* Expands to the four horizontal qpel lowpass filters (16-wide and 8-wide,
 * MMX2 and 3DNow! flavours) used for MPEG-4 quarter-pel motion compensation.
 * Per output pixel: (20a - 6b + 3c - d + ROUNDER) >> 5, where a..d are the
 * four symmetric tap-pair sums; the result is packed with unsigned
 * saturation and merged into dst via OP_MMX2 / OP_3DNOW (put or avg).
 * The MMX2 versions build the shifted pixel windows with pshufw/psllq/psrlq
 * entirely in SIMD; the 3DNow! versions compute the taps in scalar C (see
 * the FIXME) and only round/pack in MMX. */
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq (%0), %%mm0                 \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        "movq %%mm0, %5                   \n\t" /* spill first 4 results to temp */\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
\
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2               \n\t" /* b */\
        "paddw %%mm5, %%mm3               \n\t" /* c */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm4               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "movq %5, %%mm1                   \n\t" /* reload spilled first half */\
        "packuswb %%mm3, %%mm1            \n\t"\
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
\
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5               \n\t" /* b */\
        "paddw %%mm4, %%mm0               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2               \n\t" /* d */\
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6               \n\t" /* a */\
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
        "paddw %6, %%mm0                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
\
        /* last 4 pixels: edge-mirrored taps built with pshufw */\
        "paddw %%mm5, %%mm3               \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4               \n\t" /* c */\
        "paddw %%mm2, %%mm5               \n\t" /* d */\
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm4                  \n\t"\
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4                  \n\t"\
        "packuswb %%mm4, %%mm0            \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
\
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        /* scalar tap computation; edges are mirrored at i >= 13 */\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        /* rounding, >>5, pack and merge the 16 results in MMX */\
        asm volatile(\
            "movq (%0), %%mm0             \n\t"\
            "movq 8(%0), %%mm1            \n\t"\
            "paddw %2, %%mm0              \n\t"\
            "paddw %2, %%mm1              \n\t"\
            "psraw $5, %%mm0              \n\t"\
            "psraw $5, %%mm1              \n\t"\
            "packuswb %%mm1, %%mm0        \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0           \n\t"\
            "movq 24(%0), %%mm1           \n\t"\
            "paddw %2, %%mm0              \n\t"\
            "paddw %2, %%mm1              \n\t"\
            "psraw $5, %%mm0              \n\t"\
            "psraw $5, %%mm1              \n\t"\
            "packuswb %%mm1, %%mm0        \n\t"\
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
    /* NOTE(review): temp (%5) appears unused in this 8-wide body — it is
     * presumably kept so the operand numbering matches the 16-wide version;
     * confirm before removing. */\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq (%0), %%mm0                 \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
\
        /* last 4 pixels: edge-mirrored taps built with pshufw */\
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm2               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3               \n\t" /* c */\
        "paddw %%mm5, %%mm4               \n\t" /* d */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "packuswb %%mm3, %%mm0            \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
\
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        /* scalar tap computation; edges are mirrored at i >= 5 */\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        /* rounding, >>5, pack and merge the 8 results in MMX */\
        asm volatile(\
            "movq (%0), %%mm0             \n\t"\
            "movq 8(%0), %%mm1            \n\t"\
            "paddw %2, %%mm0              \n\t"\
            "paddw %2, %%mm1              \n\t"\
            "psraw $5, %%mm0              \n\t"\
            "psraw $5, %%mm1              \n\t"\
            "packuswb %%mm1, %%mm0        \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            :"memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}
1929 | |
1930 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ | |
1931 \ | |
1932 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
954 | 1933 uint64_t temp[17*4];\ |
1934 uint64_t *temp_ptr= temp;\ | |
1935 int count= 17;\ | |
1936 \ | |
1937 /*FIXME unroll */\ | |
1938 asm volatile(\ | |
1939 "pxor %%mm7, %%mm7 \n\t"\ | |
1940 "1: \n\t"\ | |
1941 "movq (%0), %%mm0 \n\t"\ | |
1942 "movq (%0), %%mm1 \n\t"\ | |
1943 "movq 8(%0), %%mm2 \n\t"\ | |
1944 "movq 8(%0), %%mm3 \n\t"\ | |
1945 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1946 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
1947 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1948 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1949 "movq %%mm0, (%1) \n\t"\ | |
1950 "movq %%mm1, 17*8(%1) \n\t"\ | |
967 | 1951 "movq %%mm2, 2*17*8(%1) \n\t"\ |
1952 "movq %%mm3, 3*17*8(%1) \n\t"\ | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
1953 "add $8, %1 \n\t"\ |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
1954 "add %3, %0 \n\t"\ |
954 | 1955 "decl %2 \n\t"\ |
1956 " jnz 1b \n\t"\ | |
1957 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
1958 : "r" ((long)srcStride)\ |
966 | 1959 : "memory"\ |
954 | 1960 );\ |
1961 \ | |
1962 temp_ptr= temp;\ | |
1963 count=4;\ | |
1964 \ | |
1965 /*FIXME reorder for speed */\ | |
1966 asm volatile(\ | |
1967 /*"pxor %%mm7, %%mm7 \n\t"*/\ | |
1968 "1: \n\t"\ | |
1969 "movq (%0), %%mm0 \n\t"\ | |
1970 "movq 8(%0), %%mm1 \n\t"\ | |
1971 "movq 16(%0), %%mm2 \n\t"\ | |
1972 "movq 24(%0), %%mm3 \n\t"\ | |
961 | 1973 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
1974 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
1975 "add %4, %1 \n\t"\ |
961 | 1976 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
954 | 1977 \ |
961 | 1978 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
1979 "add %4, %1 \n\t"\ |
961 | 1980 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
1981 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
1982 "add %4, %1 \n\t"\ |
961 | 1983 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ |
1984 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
1985 "add %4, %1 \n\t"\ |
961 | 1986 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ |
1987 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
1988 "add %4, %1 \n\t"\ |
961 | 1989 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ |
1990 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
1991 "add %4, %1 \n\t"\ |
961 | 1992 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ |
954 | 1993 \ |
961 | 1994 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
1995 "add %4, %1 \n\t" \ |
961 | 1996 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ |
1997 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ | |
954 | 1998 \ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
1999 "add $136, %0 \n\t"\ |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2000 "add %6, %1 \n\t"\ |
954 | 2001 "decl %2 \n\t"\ |
2002 " jnz 1b \n\t"\ | |
958
9bb668034ecf
slowdown / gcc 2.95.* bug workaround (this should be reversed as soon as gcc 2.95.* support is droped)
michaelni
parents:
954
diff
changeset
|
2003 \ |
967 | 2004 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2005 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\ |
966 | 2006 :"memory"\ |
954 | 2007 );\ |
2008 }\ | |
2009 \ | |
1057 | 2010 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2209 | 2011 uint64_t temp[9*2];\ |
954 | 2012 uint64_t *temp_ptr= temp;\ |
2013 int count= 9;\ | |
2014 \ | |
2015 /*FIXME unroll */\ | |
2016 asm volatile(\ | |
2017 "pxor %%mm7, %%mm7 \n\t"\ | |
2018 "1: \n\t"\ | |
2019 "movq (%0), %%mm0 \n\t"\ | |
2020 "movq (%0), %%mm1 \n\t"\ | |
2021 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2022 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
2023 "movq %%mm0, (%1) \n\t"\ | |
2024 "movq %%mm1, 9*8(%1) \n\t"\ | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2025 "add $8, %1 \n\t"\ |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2026 "add %3, %0 \n\t"\ |
954 | 2027 "decl %2 \n\t"\ |
2028 " jnz 1b \n\t"\ | |
2029 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2030 : "r" ((long)srcStride)\ |
966 | 2031 : "memory"\ |
954 | 2032 );\ |
2033 \ | |
2034 temp_ptr= temp;\ | |
2035 count=2;\ | |
2036 \ | |
2037 /*FIXME reorder for speed */\ | |
2038 asm volatile(\ | |
2039 /*"pxor %%mm7, %%mm7 \n\t"*/\ | |
2040 "1: \n\t"\ | |
2041 "movq (%0), %%mm0 \n\t"\ | |
2042 "movq 8(%0), %%mm1 \n\t"\ | |
2043 "movq 16(%0), %%mm2 \n\t"\ | |
2044 "movq 24(%0), %%mm3 \n\t"\ | |
961 | 2045 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
2046 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2047 "add %4, %1 \n\t"\ |
961 | 2048 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
954 | 2049 \ |
961 | 2050 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2051 "add %4, %1 \n\t"\ |
961 | 2052 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
954 | 2053 \ |
961 | 2054 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2055 "add %4, %1 \n\t"\ |
961 | 2056 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ |
2057 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ | |
954 | 2058 \ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2059 "add $72, %0 \n\t"\ |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2060 "add %6, %1 \n\t"\ |
954 | 2061 "decl %2 \n\t"\ |
2062 " jnz 1b \n\t"\ | |
2063 \ | |
961 | 2064 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2065 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\ |
966 | 2066 : "memory"\ |
2067 );\ | |
959 | 2068 }\ |
954 | 2069 \ |
1064 | 2070 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ |
959 | 2071 OPNAME ## pixels8_mmx(dst, src, stride, 8);\ |
954 | 2072 }\ |
2073 \ | |
1064 | 2074 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2075 uint64_t temp[8];\ |
954 | 2076 uint8_t * const half= (uint8_t*)temp;\ |
2077 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2078 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ |
954 | 2079 }\ |
2080 \ | |
1064 | 2081 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2082 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ |
2083 }\ | |
2084 \ | |
1064 | 2085 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2086 uint64_t temp[8];\ |
954 | 2087 uint8_t * const half= (uint8_t*)temp;\ |
2088 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2089 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ |
954 | 2090 }\ |
2091 \ | |
1064 | 2092 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2093 uint64_t temp[8];\ |
954 | 2094 uint8_t * const half= (uint8_t*)temp;\ |
959 | 2095 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2096 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ |
954 | 2097 }\ |
2098 \ | |
1064 | 2099 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
959 | 2100 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ |
954 | 2101 }\ |
2102 \ | |
1064 | 2103 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2104 uint64_t temp[8];\ |
954 | 2105 uint8_t * const half= (uint8_t*)temp;\ |
959 | 2106 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2107 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ |
954 | 2108 }\ |
1064 | 2109 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2110 uint64_t half[8 + 9];\ |
2111 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
2112 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2113 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2114 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ |
959 | 2115 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2116 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ |
954 | 2117 }\ |
1064 | 2118 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2119 uint64_t half[8 + 9];\ |
2120 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
2121 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2122 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2123 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ |
959 | 2124 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2125 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ |
954 | 2126 }\ |
1064 | 2127 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2128 uint64_t half[8 + 9];\ |
2129 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
2130 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2131 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2132 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ |
959 | 2133 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2134 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ |
954 | 2135 }\ |
1064 | 2136 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2137 uint64_t half[8 + 9];\ |
2138 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
2139 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2140 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2141 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ |
959 | 2142 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2143 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ |
954 | 2144 }\ |
1064 | 2145 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2146 uint64_t half[8 + 9];\ |
954 | 2147 uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
2148 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2149 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
959 | 2150 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2151 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ |
954 | 2152 }\ |
1064 | 2153 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2154 uint64_t half[8 + 9];\ |
954 | 2155 uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
2156 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2157 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
959 | 2158 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2159 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ |
954 | 2160 }\ |
1064 | 2161 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2162 uint64_t half[8 + 9];\ |
2163 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 2164 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2165 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ |
984 | 2166 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ |
954 | 2167 }\ |
1064 | 2168 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2169 uint64_t half[8 + 9];\ |
2170 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 2171 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2172 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ |
984 | 2173 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ |
954 | 2174 }\ |
1064 | 2175 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2176 uint64_t half[9];\ |
954 | 2177 uint8_t * const halfH= ((uint8_t*)half);\ |
2178 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
959 | 2179 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ |
954 | 2180 }\ |
1064 | 2181 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ |
959 | 2182 OPNAME ## pixels16_mmx(dst, src, stride, 16);\ |
954 | 2183 }\ |
2184 \ | |
1064 | 2185 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2186 uint64_t temp[32];\ |
2187 uint8_t * const half= (uint8_t*)temp;\ | |
2188 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2189 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ |
954 | 2190 }\ |
2191 \ | |
1064 | 2192 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2193 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ |
2194 }\ | |
2195 \ | |
1064 | 2196 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2197 uint64_t temp[32];\ |
2198 uint8_t * const half= (uint8_t*)temp;\ | |
2199 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2200 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\ |
954 | 2201 }\ |
2202 \ | |
1064 | 2203 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2204 uint64_t temp[32];\ |
2205 uint8_t * const half= (uint8_t*)temp;\ | |
959 | 2206 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2207 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ |
954 | 2208 }\ |
2209 \ | |
1064 | 2210 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
959 | 2211 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ |
954 | 2212 }\ |
2213 \ | |
1064 | 2214 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2215 uint64_t temp[32];\ |
2216 uint8_t * const half= (uint8_t*)temp;\ | |
959 | 2217 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2218 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\ |
954 | 2219 }\ |
1064 | 2220 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2221 uint64_t half[16*2 + 17*2];\ |
2222 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2223 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2224 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2225 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ |
959 | 2226 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2227 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ |
954 | 2228 }\ |
1064 | 2229 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2230 uint64_t half[16*2 + 17*2];\ |
2231 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2232 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2233 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2234 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ |
959 | 2235 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2236 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ |
954 | 2237 }\ |
1064 | 2238 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2239 uint64_t half[16*2 + 17*2];\ |
2240 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2241 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2242 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2243 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ |
959 | 2244 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2245 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ |
954 | 2246 }\ |
1064 | 2247 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2248 uint64_t half[16*2 + 17*2];\ |
2249 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2250 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2251 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2252 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ |
959 | 2253 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2254 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ |
954 | 2255 }\ |
1064 | 2256 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2257 uint64_t half[16*2 + 17*2];\ |
2258 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2259 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2260 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
959 | 2261 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2262 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ |
954 | 2263 }\ |
1064 | 2264 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2265 uint64_t half[16*2 + 17*2];\ |
2266 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
2267 uint8_t * const halfHV= ((uint8_t*)half);\ | |
2268 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
959 | 2269 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2270 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ |
954 | 2271 }\ |
1064 | 2272 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2273 uint64_t half[17*2];\ |
2274 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 2275 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2276 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ |
984 | 2277 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
954 | 2278 }\ |
1064 | 2279 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2280 uint64_t half[17*2];\ |
2281 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 2282 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2283 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ |
984 | 2284 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
954 | 2285 }\ |
1064 | 2286 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2287 uint64_t half[17*2];\ |
2288 uint8_t * const halfH= ((uint8_t*)half);\ | |
2289 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
959 | 2290 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
954 | 2291 } |
2292 | |
/* Store/average macros plugged in as the OP argument of the QPEL macros
   above.  Each expands to a stringized asm fragment:
     a    = source MMX register
     b    = destination memory operand
     temp = scratch MMX register (unused by PUT_OP)
     size = mov suffix (e.g. q for 64 bit, d for 32 bit) */
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* Read destination, average with 3DNow! pavgusb, write back. */
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
/* Read destination, average with MMX2 pavgb, write back. */
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
959 | 2302 |
/* Instantiate the qpel function families:
   - QPEL_BASE emits the shared lowpass helpers (put_/avg_/put_no_rnd_),
   - QPEL_OP emits the per-CPU mc00..mc33 entry points for 3dnow and mmx2.
   ff_pw_16/ff_pw_15 select the rounder constant; the _no_rnd_ variants
   use 15 for the no-rounding forms. */
QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
2312 | |
#if 0
/* Dead no-op function, disabled; presumably kept around for
   benchmarking/function-pointer experiments — TODO confirm or remove. */
static void just_return() { return; }
#endif
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2316 |
/* Install the put/put_no_rnd/avg variants of one qpel function slot
   in the DSPContext 'c' (postfix1 = context field suffix,
   postfix2 = implementation suffix). */
#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_ ## postfix1 = put_ ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_ ## postfix1 = avg_ ## postfix2;
1092 | 2321 |
/* MMX rate-distortion helper for the trellis quantizer: for each of the
   64 coefficients it scales basis[i] by 'scale' (fixed point, via pmulhw
   with rounding), adds rem[i], weights the result with weight[i] and
   accumulates the sum of squares; returns that sum >> 6 (psrld $4 per
   pair plus final psrld $2).  Read-only on all three arrays.
   NOTE(review): the asm clobbers mm0/mm1/mm5/mm6/mm7 without declaring
   them — relies on surrounding code not holding values in MMX registers
   across the call; presumably safe with the compilers of the era. */
static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    long i=0;

    /* scale must fit the 16-bit pmulhw multiplicand after the shift */
    assert(ABS(scale) < 256);
    scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;

    asm volatile(
        "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
        "psrlw $15, %%mm6 \n\t" // 1w
        "pxor %%mm7, %%mm7 \n\t" // mm7 = dword accumulator
        "movd %4, %%mm5 \n\t"
        "punpcklwd %%mm5, %%mm5 \n\t"
        "punpcklwd %%mm5, %%mm5 \n\t" // mm5 = scale broadcast to 4 words
        "1: \n\t"
        "movq (%1, %0), %%mm0 \n\t" // basis[i..i+3]
        "movq 8(%1, %0), %%mm1 \n\t"
        "pmulhw %%mm5, %%mm0 \n\t" // (basis*scale)>>16
        "pmulhw %%mm5, %%mm1 \n\t"
        "paddw %%mm6, %%mm0 \n\t" // +1 then >>1: round to nearest
        "paddw %%mm6, %%mm1 \n\t"
        "psraw $1, %%mm0 \n\t"
        "psraw $1, %%mm1 \n\t"
        "paddw (%2, %0), %%mm0 \n\t" // + rem[i..i+3]
        "paddw 8(%2, %0), %%mm1 \n\t"
        "psraw $6, %%mm0 \n\t"
        "psraw $6, %%mm1 \n\t"
        "pmullw (%3, %0), %%mm0 \n\t" // * weight[i..i+3]
        "pmullw 8(%3, %0), %%mm1 \n\t"
        "pmaddwd %%mm0, %%mm0 \n\t" // square and pair-sum to dwords
        "pmaddwd %%mm1, %%mm1 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "psrld $4, %%mm0 \n\t"
        "paddd %%mm0, %%mm7 \n\t"
        "add $16, %0 \n\t"
        "cmp $128, %0 \n\t" //FIXME optimize & bench
        " jb 1b \n\t"
        "movq %%mm7, %%mm6 \n\t" // horizontal add of the two dwords
        "psrlq $32, %%mm7 \n\t"
        "paddd %%mm6, %%mm7 \n\t"
        "psrld $2, %%mm7 \n\t"
        "movd %%mm7, %0 \n\t"

        : "+r" (i)
        : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
    );
    return i;
}
2369 | |
2370 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){ | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2371 long i=0; |
1784 | 2372 |
2373 if(ABS(scale) < 256){ | |
2374 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; | |
2375 asm volatile( | |
2376 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w | |
2377 "psrlw $15, %%mm6 \n\t" // 1w | |
2378 "movd %3, %%mm5 \n\t" | |
2379 "punpcklwd %%mm5, %%mm5 \n\t" | |
2380 "punpcklwd %%mm5, %%mm5 \n\t" | |
2381 "1: \n\t" | |
2382 "movq (%1, %0), %%mm0 \n\t" | |
2383 "movq 8(%1, %0), %%mm1 \n\t" | |
2384 "pmulhw %%mm5, %%mm0 \n\t" | |
2385 "pmulhw %%mm5, %%mm1 \n\t" | |
2386 "paddw %%mm6, %%mm0 \n\t" | |
2387 "paddw %%mm6, %%mm1 \n\t" | |
2388 "psraw $1, %%mm0 \n\t" | |
2389 "psraw $1, %%mm1 \n\t" | |
2390 "paddw (%2, %0), %%mm0 \n\t" | |
2391 "paddw 8(%2, %0), %%mm1 \n\t" | |
2392 "movq %%mm0, (%2, %0) \n\t" | |
2393 "movq %%mm1, 8(%2, %0) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2394 "add $16, %0 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2256
diff
changeset
|
2395 "cmp $128, %0 \n\t" //FIXME optimize & bench |
1784 | 2396 " jb 1b \n\t" |
2397 | |
2398 : "+r" (i) | |
2399 : "r"(basis), "r"(rem), "g"(scale) | |
2400 ); | |
2401 }else{ | |
2402 for(i=0; i<8*8; i++){ | |
2403 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); | |
2404 } | |
2405 } | |
2406 } | |
2754 | 2407 |
2408 #include "h264dsp_mmx.c" | |
1784 | 2409 |
1092 | 2410 /* external functions, from idct_mmx.c */ |
2411 void ff_mmx_idct(DCTELEM *block); | |
2412 void ff_mmxext_idct(DCTELEM *block); | |
2413 | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2414 void ff_vp3_idct_sse2(int16_t *input_data); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2415 void ff_vp3_idct_mmx(int16_t *data); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2416 void ff_vp3_dsp_init_mmx(void); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2417 |
1092 | 2418 /* XXX: those functions should be suppressed ASAP when all IDCTs are |
2419 converted */ | |
2420 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block) | |
2421 { | |
2422 ff_mmx_idct (block); | |
2423 put_pixels_clamped_mmx(block, dest, line_size); | |
2424 } | |
2425 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block) | |
2426 { | |
2427 ff_mmx_idct (block); | |
2428 add_pixels_clamped_mmx(block, dest, line_size); | |
2429 } | |
2430 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block) | |
2431 { | |
2432 ff_mmxext_idct (block); | |
2433 put_pixels_clamped_mmx(block, dest, line_size); | |
2434 } | |
2435 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block) | |
2436 { | |
2437 ff_mmxext_idct (block); | |
2438 add_pixels_clamped_mmx(block, dest, line_size); | |
2439 } | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2440 static void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block) |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2441 { |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2442 ff_vp3_idct_sse2(block); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2443 put_signed_pixels_clamped_mmx(block, dest, line_size); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2444 } |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2445 static void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block) |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2446 { |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2447 ff_vp3_idct_sse2(block); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2448 add_pixels_clamped_mmx(block, dest, line_size); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2449 } |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2450 static void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2451 { |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2452 ff_vp3_idct_mmx(block); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2453 put_signed_pixels_clamped_mmx(block, dest, line_size); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2454 } |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2455 static void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2456 { |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2457 ff_vp3_idct_mmx(block); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2458 add_pixels_clamped_mmx(block, dest, line_size); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2459 } |
2868 | 2460 #ifdef CONFIG_GPL |
2461 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block) | |
2462 { | |
2463 ff_idct_xvid_mmx (block); | |
2464 put_pixels_clamped_mmx(block, dest, line_size); | |
2465 } | |
2466 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block) | |
2467 { | |
2468 ff_idct_xvid_mmx (block); | |
2469 add_pixels_clamped_mmx(block, dest, line_size); | |
2470 } | |
2471 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block) | |
2472 { | |
2473 ff_idct_xvid_mmx2 (block); | |
2474 put_pixels_clamped_mmx(block, dest, line_size); | |
2475 } | |
2476 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block) | |
2477 { | |
2478 ff_idct_xvid_mmx2 (block); | |
2479 add_pixels_clamped_mmx(block, dest, line_size); | |
2480 } | |
2481 #endif | |
954 | 2482 |
1092 | 2483 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
0 | 2484 { |
2485 mm_flags = mm_support(); | |
1115 | 2486 |
1122 | 2487 if (avctx->dsp_mask) { |
2488 if (avctx->dsp_mask & FF_MM_FORCE) | |
2489 mm_flags |= (avctx->dsp_mask & 0xffff); | |
2490 else | |
2491 mm_flags &= ~(avctx->dsp_mask & 0xffff); | |
2492 } | |
1115 | 2493 |
631
47a8964ba5cd
be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents:
629
diff
changeset
|
2494 #if 0 |
1868 | 2495 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:"); |
0 | 2496 if (mm_flags & MM_MMX) |
1868 | 2497 av_log(avctx, AV_LOG_INFO, " mmx"); |
0 | 2498 if (mm_flags & MM_MMXEXT) |
1868 | 2499 av_log(avctx, AV_LOG_INFO, " mmxext"); |
0 | 2500 if (mm_flags & MM_3DNOW) |
1868 | 2501 av_log(avctx, AV_LOG_INFO, " 3dnow"); |
0 | 2502 if (mm_flags & MM_SSE) |
1868 | 2503 av_log(avctx, AV_LOG_INFO, " sse"); |
0 | 2504 if (mm_flags & MM_SSE2) |
1868 | 2505 av_log(avctx, AV_LOG_INFO, " sse2"); |
2506 av_log(avctx, AV_LOG_INFO, "\n"); | |
0 | 2507 #endif |
2508 | |
2509 if (mm_flags & MM_MMX) { | |
1092 | 2510 const int idct_algo= avctx->idct_algo; |
2511 | |
1232
e88d3b1fb2a1
more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>)
michaelni
parents:
1186
diff
changeset
|
2512 #ifdef CONFIG_ENCODERS |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1985
diff
changeset
|
2513 const int dct_algo = avctx->dct_algo; |
1565 | 2514 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
2515 if(mm_flags & MM_SSE2){ |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
2516 c->fdct = ff_fdct_sse2; |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
2517 }else if(mm_flags & MM_MMXEXT){ |
1565 | 2518 c->fdct = ff_fdct_mmx2; |
2519 }else{ | |
2520 c->fdct = ff_fdct_mmx; | |
2521 } | |
2522 } | |
1232
e88d3b1fb2a1
more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>)
michaelni
parents:
1186
diff
changeset
|
2523 #endif //CONFIG_ENCODERS |
2256 | 2524 if(avctx->lowres==0){ |
2525 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ | |
2526 c->idct_put= ff_simple_idct_put_mmx; | |
2527 c->idct_add= ff_simple_idct_add_mmx; | |
2528 c->idct = ff_simple_idct_mmx; | |
2529 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; | |
2530 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ | |
2531 if(mm_flags & MM_MMXEXT){ | |
2532 c->idct_put= ff_libmpeg2mmx2_idct_put; | |
2533 c->idct_add= ff_libmpeg2mmx2_idct_add; | |
2534 c->idct = ff_mmxext_idct; | |
2535 }else{ | |
2536 c->idct_put= ff_libmpeg2mmx_idct_put; | |
2537 c->idct_add= ff_libmpeg2mmx_idct_add; | |
2538 c->idct = ff_mmx_idct; | |
2539 } | |
2540 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2541 }else if(idct_algo==FF_IDCT_VP3){ |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2542 if(mm_flags & MM_SSE2){ |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2543 c->idct_put= ff_vp3_idct_put_sse2; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2544 c->idct_add= ff_vp3_idct_add_sse2; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2545 c->idct = ff_vp3_idct_sse2; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2546 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2547 }else{ |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2548 ff_vp3_dsp_init_mmx(); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2549 c->idct_put= ff_vp3_idct_put_mmx; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2550 c->idct_add= ff_vp3_idct_add_mmx; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2551 c->idct = ff_vp3_idct_mmx; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2552 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM; |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2691
diff
changeset
|
2553 } |
2868 | 2554 #ifdef CONFIG_GPL |
2555 }else if(idct_algo==FF_IDCT_XVIDMMX){ | |
2556 if(mm_flags & MM_MMXEXT){ | |
2557 c->idct_put= ff_idct_xvid_mmx2_put; | |
2558 c->idct_add= ff_idct_xvid_mmx2_add; | |
2559 c->idct = ff_idct_xvid_mmx2; | |
2560 }else{ | |
2561 c->idct_put= ff_idct_xvid_mmx_put; | |
2562 c->idct_add= ff_idct_xvid_mmx_add; | |
2563 c->idct = ff_idct_xvid_mmx; | |
2564 } | |
2871
f4afa3812818
Fix compile without CONFIG_GPL, misplaced #endif caused a missing }.
reimar
parents:
2868
diff
changeset
|
2565 #endif |
1092 | 2566 } |
2567 } | |
1868 | 2568 |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
2569 #ifdef CONFIG_ENCODERS |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2570 c->get_pixels = get_pixels_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2571 c->diff_pixels = diff_pixels_mmx; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
2572 #endif //CONFIG_ENCODERS |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2573 c->put_pixels_clamped = put_pixels_clamped_mmx; |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
2574 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx; |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2575 c->add_pixels_clamped = add_pixels_clamped_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2576 c->clear_blocks = clear_blocks_mmx; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
2577 #ifdef CONFIG_ENCODERS |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2578 c->pix_sum = pix_sum16_mmx; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
2579 #endif //CONFIG_ENCODERS |
415 | 2580 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2581 c->put_pixels_tab[0][0] = put_pixels16_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2582 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2583 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2584 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx; |
0 | 2585 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2586 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2587 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2588 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2589 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx; |
651 | 2590 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2591 c->avg_pixels_tab[0][0] = avg_pixels16_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2592 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2593 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2594 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx; |
415 | 2595 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2596 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2597 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2598 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2599 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2600 |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2601 c->put_pixels_tab[1][0] = put_pixels8_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2602 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2603 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2604 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx; |
0 | 2605 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2606 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2607 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2608 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2609 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx; |
651 | 2610 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2611 c->avg_pixels_tab[1][0] = avg_pixels8_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2612 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2613 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2614 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx; |
651 | 2615 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2616 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2617 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2618 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2619 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx; |
954 | 2620 |
866 | 2621 c->add_bytes= add_bytes_mmx; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
2622 #ifdef CONFIG_ENCODERS |
866 | 2623 c->diff_bytes= diff_bytes_mmx; |
936 | 2624 |
2625 c->hadamard8_diff[0]= hadamard8_diff16_mmx; | |
2626 c->hadamard8_diff[1]= hadamard8_diff_mmx; | |
2627 | |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
984
diff
changeset
|
2628 c->pix_norm1 = pix_norm1_mmx; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
984
diff
changeset
|
2629 c->sse[0] = sse16_mmx; |
2067 | 2630 c->sse[1] = sse8_mmx; |
1729 | 2631 c->vsad[4]= vsad_intra16_mmx; |
2632 | |
2067 | 2633 c->nsse[0] = nsse16_mmx; |
2634 c->nsse[1] = nsse8_mmx; | |
1729 | 2635 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
2636 c->vsad[0] = vsad16_mmx; | |
2637 } | |
1784 | 2638 |
2639 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
2640 c->try_8x8basis= try_8x8basis_mmx; | |
2641 } | |
2642 c->add_8x8basis= add_8x8basis_mmx; | |
2643 | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
2644 #endif //CONFIG_ENCODERS |
1647 | 2645 |
2646 c->h263_v_loop_filter= h263_v_loop_filter_mmx; | |
1784 | 2647 c->h263_h_loop_filter= h263_h_loop_filter_mmx; |
2732 | 2648 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx; |
936 | 2649 |
0 | 2650 if (mm_flags & MM_MMXEXT) { |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2651 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2652 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; |
651 | 2653 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2654 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2655 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2656 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; |
415 | 2657 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2658 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2659 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2; |
651 | 2660 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2661 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2662 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2663 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; |
1092 | 2664 |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
2665 #ifdef CONFIG_ENCODERS |
1153 | 2666 c->hadamard8_diff[0]= hadamard8_diff16_mmx2; |
2667 c->hadamard8_diff[1]= hadamard8_diff_mmx2; | |
1729 | 2668 c->vsad[4]= vsad_intra16_mmx2; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
2669 #endif //CONFIG_ENCODERS |
1153 | 2670 |
2745 | 2671 c->h264_idct_add= ff_h264_idct_add_mmx2; |
2672 | |
1092 | 2673 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
2674 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; | |
2675 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; | |
2676 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; | |
2677 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; | |
2678 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; | |
2679 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; | |
1772
8cd5257195c9
vsad16_mmx2 only applies if encoders are turned on
melanson
parents:
1765
diff
changeset
|
2680 #ifdef CONFIG_ENCODERS |
1729 | 2681 c->vsad[0] = vsad16_mmx2; |
1772
8cd5257195c9
vsad16_mmx2 only applies if encoders are turned on
melanson
parents:
1765
diff
changeset
|
2682 #endif //CONFIG_ENCODERS |
1092 | 2683 } |
959 | 2684 |
961 | 2685 #if 1 |
954 | 2686 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2) |
2687 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2) | |
2688 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2) | |
2689 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2) | |
2690 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2) | |
2691 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2) | |
2692 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2) | |
2693 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2) | |
2694 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2) | |
2695 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2) | |
2696 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2) | |
2697 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2) | |
2698 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2) | |
2699 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2) | |
2700 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2) | |
2701 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2) | |
2702 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2) | |
2703 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2) | |
2704 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2) | |
2705 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2) | |
2706 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2) | |
2707 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2) | |
2708 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2) | |
2709 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2) | |
2710 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2) | |
2711 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2) | |
2712 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2) | |
2713 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2) | |
2714 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2) | |
2715 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2) | |
2716 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2) | |
2717 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2) | |
961 | 2718 #endif |
1527 | 2719 |
2209 | 2720 //FIXME 3dnow too |
2721 #define dspfunc(PFX, IDX, NUM) \ | |
2722 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \ | |
2723 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \ | |
2724 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \ | |
2725 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \ | |
2726 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \ | |
2727 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \ | |
2728 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \ | |
2729 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \ | |
2730 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \ | |
2731 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \ | |
2732 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \ | |
2733 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \ | |
2734 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \ | |
2735 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \ | |
2736 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \ | |
2737 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2 | |
2738 | |
2739 dspfunc(put_h264_qpel, 0, 16); | |
2740 dspfunc(put_h264_qpel, 1, 8); | |
2741 dspfunc(put_h264_qpel, 2, 4); | |
2742 dspfunc(avg_h264_qpel, 0, 16); | |
2743 dspfunc(avg_h264_qpel, 1, 8); | |
2744 dspfunc(avg_h264_qpel, 2, 4); | |
2745 #undef dspfunc | |
2746 | |
2732 | 2747 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2; |
2633 | 2748 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; |
2749 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; | |
2750 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; | |
2751 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2752 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2753 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; |
2633 | 2754 |
1686
68abbec33289
Here are juste two added #ifdef CONFIG_ENCODERS to allow
michael
parents:
1648
diff
changeset
|
2755 #ifdef CONFIG_ENCODERS |
1527 | 2756 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; |
1686
68abbec33289
Here are juste two added #ifdef CONFIG_ENCODERS to allow
michael
parents:
1648
diff
changeset
|
2757 #endif //CONFIG_ENCODERS |
0 | 2758 } else if (mm_flags & MM_3DNOW) { |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2759 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2760 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; |
393 | 2761 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2762 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2763 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2764 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; |
651 | 2765 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2766 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2767 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2768 |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2769 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2770 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
2771 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; |
1092 | 2772 |
2773 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
2774 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; | |
2775 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; | |
2776 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; | |
2777 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; | |
2778 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; | |
2779 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; | |
2780 } | |
984 | 2781 |
954 | 2782 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow) |
2783 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow) | |
2784 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow) | |
2785 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow) | |
2786 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow) | |
2787 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow) | |
2788 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow) | |
2789 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow) | |
2790 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow) | |
2791 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow) | |
2792 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow) | |
2793 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow) | |
2794 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow) | |
2795 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow) | |
2796 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow) | |
2797 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow) | |
2798 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow) | |
2799 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow) | |
2800 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow) | |
2801 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow) | |
2802 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow) | |
2803 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow) | |
2804 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow) | |
2805 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow) | |
2806 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow) | |
2807 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow) | |
2808 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow) | |
2809 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow) | |
2810 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow) | |
2811 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow) | |
2812 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow) | |
2813 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow) | |
2209 | 2814 |
2815 #define dspfunc(PFX, IDX, NUM) \ | |
2816 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \ | |
2817 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \ | |
2818 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \ | |
2819 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \ | |
2820 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \ | |
2821 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \ | |
2822 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \ | |
2823 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \ | |
2824 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \ | |
2825 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \ | |
2826 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \ | |
2827 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \ | |
2828 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \ | |
2829 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \ | |
2830 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \ | |
2831 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow | |
2832 | |
2833 dspfunc(put_h264_qpel, 0, 16); | |
2834 dspfunc(put_h264_qpel, 1, 8); | |
2835 dspfunc(put_h264_qpel, 2, 4); | |
2836 dspfunc(avg_h264_qpel, 0, 16); | |
2837 dspfunc(avg_h264_qpel, 1, 8); | |
2838 dspfunc(avg_h264_qpel, 2, 4); | |
2732 | 2839 |
2840 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow; | |
0 | 2841 } |
2842 } | |
1092 | 2843 |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
2844 #ifdef CONFIG_ENCODERS |
1092 | 2845 dsputil_init_pix_mmx(c, avctx); |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
2846 #endif //CONFIG_ENCODERS |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2847 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2848 // for speed testing |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2849 get_pixels = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2850 put_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2851 add_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2852 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2853 pix_abs16x16 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2854 pix_abs16x16_x2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2855 pix_abs16x16_y2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2856 pix_abs16x16_xy2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2857 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2858 put_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2859 put_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2860 put_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2861 put_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2862 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2863 put_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2864 put_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2865 put_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2866 put_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2867 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2868 avg_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2869 avg_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2870 avg_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2871 avg_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2872 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2873 avg_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2874 avg_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2875 avg_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2876 avg_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2877 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2878 //av_fdct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2879 //ff_idct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2880 #endif |
0 | 2881 } |