Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 1708:dea5b2946999 libavcodec
interlaced motion estimation
interlaced mpeg2 encoding
P & B frames
rate distorted interlaced mb decision
alternate scantable support
4mv encoding fixes (that's also why the regression tests change)
passing height to most dsp functions
interlaced mpeg4 encoding (no direct mode MBs yet)
various related cleanups
disabled old motion estimation algorithms (log, full, ...) they will either be fixed or removed
author | michael |
---|---|
date | Tue, 30 Dec 2003 16:07:57 +0000 |
parents | 68abbec33289 |
children | a4a5e7521339 |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
0 | 4 * |
429 | 5 * This library is free software; you can redistribute it and/or |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
0 | 9 * |
429 | 10 * This library is distributed in the hope that it will be useful, |
0 | 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 * Lesser General Public License for more details. | |
0 | 14 * |
429 | 15 * You should have received a copy of the GNU Lesser General Public |
16 * License along with this library; if not, write to the Free Software | |
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
0 | 18 * |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 #include "../dsputil.h" | |
1092 | 23 #include "../simple_idct.h" |
0 | 24 |
1647 | 25 extern const uint8_t ff_h263_loop_filter_strength[32]; |
26 | |
5 | 27 int mm_flags; /* multimedia extension flags */ |
936 | 28 |
0 | 29 /* pixel operations */ |
387 | 30 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; |
31 static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; | |
32 static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; | |
0 | 33 |
954 | 34 static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL; |
35 static const uint64_t ff_pw_3 __attribute__ ((aligned(8))) = 0x0003000300030003ULL; | |
36 static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL; | |
37 static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; | |
38 | |
1647 | 39 static const uint64_t ff_pb_FC __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL; |
40 | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
41 #define JUMPALIGN() __asm __volatile (".balign 8"::) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
42 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
43 |
448 | 44 #define MOVQ_WONE(regd) \ |
45 __asm __volatile ( \ | |
46 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
47 "psrlw $15, %%" #regd ::) | |
48 | |
49 #define MOVQ_BFE(regd) \ | |
50 __asm __volatile ( \ | |
51 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ | |
52 "paddb %%" #regd ", %%" #regd " \n\t" ::) | |
53 | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
54 #ifndef PIC |
448 | 55 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone)) |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
56 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo)) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
57 #else |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
58 // for shared library it's better to use this way for accessing constants |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
59 // pcmpeqd -> -1 |
448 | 60 #define MOVQ_BONE(regd) \ |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
61 __asm __volatile ( \ |
448 | 62 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
63 "psrlw $15, %%" #regd " \n\t" \ | |
64 "packuswb %%" #regd ", %%" #regd " \n\t" ::) | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
65 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
66 #define MOVQ_WTWO(regd) \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
67 __asm __volatile ( \ |
448 | 68 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
69 "psrlw $15, %%" #regd " \n\t" \ | |
70 "psllw $1, %%" #regd " \n\t"::) | |
387 | 71 |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
72 #endif |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
73 |
448 | 74 // using regr as temporary and for the output result |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
75 // first argument is unmodifed and second is trashed |
471 | 76 // regfe is supposed to contain 0xfefefefefefefefe |
77 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ | |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
78 "movq " #rega ", " #regr " \n\t"\ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
79 "pand " #regb ", " #regr " \n\t"\ |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
80 "pxor " #rega ", " #regb " \n\t"\ |
471 | 81 "pand " #regfe "," #regb " \n\t"\ |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
82 "psrlq $1, " #regb " \n\t"\ |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
83 "paddb " #regb ", " #regr " \n\t" |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
84 |
471 | 85 #define PAVGB_MMX(rega, regb, regr, regfe) \ |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
86 "movq " #rega ", " #regr " \n\t"\ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
87 "por " #regb ", " #regr " \n\t"\ |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
88 "pxor " #rega ", " #regb " \n\t"\ |
471 | 89 "pand " #regfe "," #regb " \n\t"\ |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
90 "psrlq $1, " #regb " \n\t"\ |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
91 "psubb " #regb ", " #regr " \n\t" |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
92 |
471 | 93 // mm6 is supposed to contain 0xfefefefefefefefe |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
94 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
95 "movq " #rega ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
96 "movq " #regc ", " #regp " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
97 "pand " #regb ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
98 "pand " #regd ", " #regp " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
99 "pxor " #rega ", " #regb " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
100 "pxor " #regc ", " #regd " \n\t"\ |
448 | 101 "pand %%mm6, " #regb " \n\t"\ |
102 "pand %%mm6, " #regd " \n\t"\ | |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
103 "psrlq $1, " #regb " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
104 "psrlq $1, " #regd " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
105 "paddb " #regb ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
106 "paddb " #regd ", " #regp " \n\t" |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
107 |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
108 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
109 "movq " #rega ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
110 "movq " #regc ", " #regp " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
111 "por " #regb ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
112 "por " #regd ", " #regp " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
113 "pxor " #rega ", " #regb " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
114 "pxor " #regc ", " #regd " \n\t"\ |
448 | 115 "pand %%mm6, " #regb " \n\t"\ |
116 "pand %%mm6, " #regd " \n\t"\ | |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
117 "psrlq $1, " #regd " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
118 "psrlq $1, " #regb " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
119 "psubb " #regb ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
120 "psubb " #regd ", " #regp " \n\t" |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
121 |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
122 /***********************************/ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
123 /* MMX no rounding */ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
124 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx |
448 | 125 #define SET_RND MOVQ_WONE |
126 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) | |
471 | 127 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
128 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
129 #include "dsputil_mmx_rnd.h" |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
130 |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
131 #undef DEF |
448 | 132 #undef SET_RND |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
133 #undef PAVGBP |
471 | 134 #undef PAVGB |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
135 /***********************************/ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
136 /* MMX rounding */ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
137 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
138 #define DEF(x, y) x ## _ ## y ##_mmx |
448 | 139 #define SET_RND MOVQ_WTWO |
140 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) | |
471 | 141 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
142 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
143 #include "dsputil_mmx_rnd.h" |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
144 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
145 #undef DEF |
448 | 146 #undef SET_RND |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
147 #undef PAVGBP |
471 | 148 #undef PAVGB |
387 | 149 |
0 | 150 /***********************************/ |
151 /* 3Dnow specific */ | |
152 | |
153 #define DEF(x) x ## _3dnow | |
154 /* for Athlons PAVGUSB is prefered */ | |
155 #define PAVGB "pavgusb" | |
156 | |
157 #include "dsputil_mmx_avg.h" | |
158 | |
159 #undef DEF | |
160 #undef PAVGB | |
161 | |
162 /***********************************/ | |
163 /* MMX2 specific */ | |
164 | |
386 | 165 #define DEF(x) x ## _mmx2 |
0 | 166 |
167 /* Introduced only in MMX2 set */ | |
168 #define PAVGB "pavgb" | |
169 | |
170 #include "dsputil_mmx_avg.h" | |
171 | |
172 #undef DEF | |
173 #undef PAVGB | |
174 | |
175 /***********************************/ | |
176 /* standard MMX */ | |
177 | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
178 #ifdef CONFIG_ENCODERS |
1064 | 179 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) |
0 | 180 { |
386 | 181 asm volatile( |
182 "movl $-128, %%eax \n\t" | |
183 "pxor %%mm7, %%mm7 \n\t" | |
184 ".balign 16 \n\t" | |
185 "1: \n\t" | |
186 "movq (%0), %%mm0 \n\t" | |
187 "movq (%0, %2), %%mm2 \n\t" | |
188 "movq %%mm0, %%mm1 \n\t" | |
189 "movq %%mm2, %%mm3 \n\t" | |
190 "punpcklbw %%mm7, %%mm0 \n\t" | |
191 "punpckhbw %%mm7, %%mm1 \n\t" | |
192 "punpcklbw %%mm7, %%mm2 \n\t" | |
193 "punpckhbw %%mm7, %%mm3 \n\t" | |
194 "movq %%mm0, (%1, %%eax)\n\t" | |
195 "movq %%mm1, 8(%1, %%eax)\n\t" | |
196 "movq %%mm2, 16(%1, %%eax)\n\t" | |
197 "movq %%mm3, 24(%1, %%eax)\n\t" | |
198 "addl %3, %0 \n\t" | |
199 "addl $32, %%eax \n\t" | |
200 "js 1b \n\t" | |
201 : "+r" (pixels) | |
202 : "r" (block+64), "r" (line_size), "r" (line_size*2) | |
203 : "%eax" | |
204 ); | |
0 | 205 } |
206 | |
1064 | 207 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) |
324 | 208 { |
209 asm volatile( | |
386 | 210 "pxor %%mm7, %%mm7 \n\t" |
211 "movl $-128, %%eax \n\t" | |
324 | 212 ".balign 16 \n\t" |
213 "1: \n\t" | |
214 "movq (%0), %%mm0 \n\t" | |
215 "movq (%1), %%mm2 \n\t" | |
216 "movq %%mm0, %%mm1 \n\t" | |
217 "movq %%mm2, %%mm3 \n\t" | |
218 "punpcklbw %%mm7, %%mm0 \n\t" | |
219 "punpckhbw %%mm7, %%mm1 \n\t" | |
220 "punpcklbw %%mm7, %%mm2 \n\t" | |
221 "punpckhbw %%mm7, %%mm3 \n\t" | |
222 "psubw %%mm2, %%mm0 \n\t" | |
223 "psubw %%mm3, %%mm1 \n\t" | |
224 "movq %%mm0, (%2, %%eax)\n\t" | |
225 "movq %%mm1, 8(%2, %%eax)\n\t" | |
226 "addl %3, %0 \n\t" | |
227 "addl %3, %1 \n\t" | |
228 "addl $16, %%eax \n\t" | |
229 "jnz 1b \n\t" | |
230 : "+r" (s1), "+r" (s2) | |
231 : "r" (block+64), "r" (stride) | |
232 : "%eax" | |
233 ); | |
234 } | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
235 #endif //CONFIG_ENCODERS |
324 | 236 |
1064 | 237 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
0 | 238 { |
239 const DCTELEM *p; | |
1064 | 240 uint8_t *pix; |
0 | 241 |
242 /* read the pixels */ | |
243 p = block; | |
244 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
245 /* unrolled loop */ |
0 | 246 __asm __volatile( |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
247 "movq %3, %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
248 "movq 8%3, %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
249 "movq 16%3, %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
250 "movq 24%3, %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
251 "movq 32%3, %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
252 "movq 40%3, %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
253 "movq 48%3, %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
254 "movq 56%3, %%mm7\n\t" |
0 | 255 "packuswb %%mm1, %%mm0\n\t" |
256 "packuswb %%mm3, %%mm2\n\t" | |
257 "packuswb %%mm5, %%mm4\n\t" | |
258 "packuswb %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
259 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
260 "movq %%mm2, (%0, %1)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
261 "movq %%mm4, (%0, %1, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
262 "movq %%mm6, (%0, %2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
263 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) |
0 | 264 :"memory"); |
265 pix += line_size*4; | |
266 p += 32; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
267 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
268 // if here would be an exact copy of the code above |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
269 // compiler would generate some very strange code |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
270 // thus using "r" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
271 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
272 "movq (%3), %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
273 "movq 8(%3), %%mm1\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
274 "movq 16(%3), %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
275 "movq 24(%3), %%mm3\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
276 "movq 32(%3), %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
277 "movq 40(%3), %%mm5\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
278 "movq 48(%3), %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
279 "movq 56(%3), %%mm7\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
280 "packuswb %%mm1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
281 "packuswb %%mm3, %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
282 "packuswb %%mm5, %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
283 "packuswb %%mm7, %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
284 "movq %%mm0, (%0)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
285 "movq %%mm2, (%0, %1)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
286 "movq %%mm4, (%0, %1, 2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
287 "movq %%mm6, (%0, %2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
288 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
289 :"memory"); |
0 | 290 } |
291 | |
1064 | 292 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
0 | 293 { |
294 const DCTELEM *p; | |
1064 | 295 uint8_t *pix; |
0 | 296 int i; |
297 | |
298 /* read the pixels */ | |
299 p = block; | |
300 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
301 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
302 i = 4; |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
303 do { |
0 | 304 __asm __volatile( |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
305 "movq (%2), %%mm0\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
306 "movq 8(%2), %%mm1\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
307 "movq 16(%2), %%mm2\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
308 "movq 24(%2), %%mm3\n\t" |
0 | 309 "movq %0, %%mm4\n\t" |
310 "movq %1, %%mm6\n\t" | |
311 "movq %%mm4, %%mm5\n\t" | |
312 "punpcklbw %%mm7, %%mm4\n\t" | |
313 "punpckhbw %%mm7, %%mm5\n\t" | |
314 "paddsw %%mm4, %%mm0\n\t" | |
315 "paddsw %%mm5, %%mm1\n\t" | |
316 "movq %%mm6, %%mm5\n\t" | |
317 "punpcklbw %%mm7, %%mm6\n\t" | |
318 "punpckhbw %%mm7, %%mm5\n\t" | |
319 "paddsw %%mm6, %%mm2\n\t" | |
320 "paddsw %%mm5, %%mm3\n\t" | |
321 "packuswb %%mm1, %%mm0\n\t" | |
322 "packuswb %%mm3, %%mm2\n\t" | |
323 "movq %%mm0, %0\n\t" | |
324 "movq %%mm2, %1\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
325 :"+m"(*pix), "+m"(*(pix+line_size)) |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
326 :"r"(p) |
0 | 327 :"memory"); |
328 pix += line_size*2; | |
329 p += 16; | |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
330 } while (--i); |
0 | 331 } |
332 | |
1064 | 333 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 334 { |
471 | 335 __asm __volatile( |
420 | 336 "lea (%3, %3), %%eax \n\t" |
422 | 337 ".balign 8 \n\t" |
420 | 338 "1: \n\t" |
339 "movq (%1), %%mm0 \n\t" | |
340 "movq (%1, %3), %%mm1 \n\t" | |
341 "movq %%mm0, (%2) \n\t" | |
342 "movq %%mm1, (%2, %3) \n\t" | |
343 "addl %%eax, %1 \n\t" | |
344 "addl %%eax, %2 \n\t" | |
345 "movq (%1), %%mm0 \n\t" | |
346 "movq (%1, %3), %%mm1 \n\t" | |
347 "movq %%mm0, (%2) \n\t" | |
348 "movq %%mm1, (%2, %3) \n\t" | |
349 "addl %%eax, %1 \n\t" | |
350 "addl %%eax, %2 \n\t" | |
351 "subl $4, %0 \n\t" | |
352 "jnz 1b \n\t" | |
353 : "+g"(h), "+r" (pixels), "+r" (block) | |
354 : "r"(line_size) | |
355 : "%eax", "memory" | |
356 ); | |
0 | 357 } |
358 | |
1064 | 359 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
651 | 360 { |
361 __asm __volatile( | |
362 "lea (%3, %3), %%eax \n\t" | |
363 ".balign 8 \n\t" | |
364 "1: \n\t" | |
365 "movq (%1), %%mm0 \n\t" | |
366 "movq 8(%1), %%mm4 \n\t" | |
367 "movq (%1, %3), %%mm1 \n\t" | |
368 "movq 8(%1, %3), %%mm5 \n\t" | |
369 "movq %%mm0, (%2) \n\t" | |
370 "movq %%mm4, 8(%2) \n\t" | |
371 "movq %%mm1, (%2, %3) \n\t" | |
372 "movq %%mm5, 8(%2, %3) \n\t" | |
373 "addl %%eax, %1 \n\t" | |
374 "addl %%eax, %2 \n\t" | |
375 "movq (%1), %%mm0 \n\t" | |
376 "movq 8(%1), %%mm4 \n\t" | |
377 "movq (%1, %3), %%mm1 \n\t" | |
378 "movq 8(%1, %3), %%mm5 \n\t" | |
379 "movq %%mm0, (%2) \n\t" | |
380 "movq %%mm4, 8(%2) \n\t" | |
381 "movq %%mm1, (%2, %3) \n\t" | |
382 "movq %%mm5, 8(%2, %3) \n\t" | |
383 "addl %%eax, %1 \n\t" | |
384 "addl %%eax, %2 \n\t" | |
385 "subl $4, %0 \n\t" | |
386 "jnz 1b \n\t" | |
387 : "+g"(h), "+r" (pixels), "+r" (block) | |
388 : "r"(line_size) | |
389 : "%eax", "memory" | |
390 ); | |
391 } | |
392 | |
296 | 393 static void clear_blocks_mmx(DCTELEM *blocks) |
394 { | |
471 | 395 __asm __volatile( |
296 | 396 "pxor %%mm7, %%mm7 \n\t" |
397 "movl $-128*6, %%eax \n\t" | |
398 "1: \n\t" | |
399 "movq %%mm7, (%0, %%eax) \n\t" | |
400 "movq %%mm7, 8(%0, %%eax) \n\t" | |
401 "movq %%mm7, 16(%0, %%eax) \n\t" | |
402 "movq %%mm7, 24(%0, %%eax) \n\t" | |
403 "addl $32, %%eax \n\t" | |
404 " js 1b \n\t" | |
405 : : "r" (((int)blocks)+128*6) | |
406 : "%eax" | |
407 ); | |
408 } | |
409 | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
410 #ifdef CONFIG_ENCODERS |
1064 | 411 static int pix_sum16_mmx(uint8_t * pix, int line_size){ |
688 | 412 const int h=16; |
413 int sum; | |
414 int index= -line_size*h; | |
415 | |
416 __asm __volatile( | |
417 "pxor %%mm7, %%mm7 \n\t" | |
418 "pxor %%mm6, %%mm6 \n\t" | |
419 "1: \n\t" | |
420 "movq (%2, %1), %%mm0 \n\t" | |
421 "movq (%2, %1), %%mm1 \n\t" | |
422 "movq 8(%2, %1), %%mm2 \n\t" | |
423 "movq 8(%2, %1), %%mm3 \n\t" | |
424 "punpcklbw %%mm7, %%mm0 \n\t" | |
425 "punpckhbw %%mm7, %%mm1 \n\t" | |
426 "punpcklbw %%mm7, %%mm2 \n\t" | |
427 "punpckhbw %%mm7, %%mm3 \n\t" | |
428 "paddw %%mm0, %%mm1 \n\t" | |
429 "paddw %%mm2, %%mm3 \n\t" | |
430 "paddw %%mm1, %%mm3 \n\t" | |
431 "paddw %%mm3, %%mm6 \n\t" | |
432 "addl %3, %1 \n\t" | |
433 " js 1b \n\t" | |
434 "movq %%mm6, %%mm5 \n\t" | |
435 "psrlq $32, %%mm6 \n\t" | |
436 "paddw %%mm5, %%mm6 \n\t" | |
437 "movq %%mm6, %%mm5 \n\t" | |
438 "psrlq $16, %%mm6 \n\t" | |
439 "paddw %%mm5, %%mm6 \n\t" | |
440 "movd %%mm6, %0 \n\t" | |
441 "andl $0xFFFF, %0 \n\t" | |
442 : "=&r" (sum), "+r" (index) | |
443 : "r" (pix - index), "r" (line_size) | |
444 ); | |
445 | |
446 return sum; | |
447 } | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
448 #endif //CONFIG_ENCODERS |
688 | 449 |
866 | 450 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ |
451 int i=0; | |
452 asm volatile( | |
453 "1: \n\t" | |
454 "movq (%1, %0), %%mm0 \n\t" | |
455 "movq (%2, %0), %%mm1 \n\t" | |
456 "paddb %%mm0, %%mm1 \n\t" | |
457 "movq %%mm1, (%2, %0) \n\t" | |
458 "movq 8(%1, %0), %%mm0 \n\t" | |
459 "movq 8(%2, %0), %%mm1 \n\t" | |
460 "paddb %%mm0, %%mm1 \n\t" | |
461 "movq %%mm1, 8(%2, %0) \n\t" | |
462 "addl $16, %0 \n\t" | |
463 "cmpl %3, %0 \n\t" | |
464 " jb 1b \n\t" | |
465 : "+r" (i) | |
466 : "r"(src), "r"(dst), "r"(w-15) | |
467 ); | |
468 for(; i<w; i++) | |
469 dst[i+0] += src[i+0]; | |
470 } | |
471 | |
1648 | 472 #define H263_LOOP_FILTER \ |
473 "pxor %%mm7, %%mm7 \n\t"\ | |
474 "movq %0, %%mm0 \n\t"\ | |
475 "movq %0, %%mm1 \n\t"\ | |
476 "movq %3, %%mm2 \n\t"\ | |
477 "movq %3, %%mm3 \n\t"\ | |
478 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
479 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
480 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
481 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
482 "psubw %%mm2, %%mm0 \n\t"\ | |
483 "psubw %%mm3, %%mm1 \n\t"\ | |
484 "movq %1, %%mm2 \n\t"\ | |
485 "movq %1, %%mm3 \n\t"\ | |
486 "movq %2, %%mm4 \n\t"\ | |
487 "movq %2, %%mm5 \n\t"\ | |
488 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
489 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
490 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
491 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
492 "psubw %%mm2, %%mm4 \n\t"\ | |
493 "psubw %%mm3, %%mm5 \n\t"\ | |
494 "psllw $2, %%mm4 \n\t"\ | |
495 "psllw $2, %%mm5 \n\t"\ | |
496 "paddw %%mm0, %%mm4 \n\t"\ | |
497 "paddw %%mm1, %%mm5 \n\t"\ | |
498 "pxor %%mm6, %%mm6 \n\t"\ | |
499 "pcmpgtw %%mm4, %%mm6 \n\t"\ | |
500 "pcmpgtw %%mm5, %%mm7 \n\t"\ | |
501 "pxor %%mm6, %%mm4 \n\t"\ | |
502 "pxor %%mm7, %%mm5 \n\t"\ | |
503 "psubw %%mm6, %%mm4 \n\t"\ | |
504 "psubw %%mm7, %%mm5 \n\t"\ | |
505 "psrlw $3, %%mm4 \n\t"\ | |
506 "psrlw $3, %%mm5 \n\t"\ | |
507 "packuswb %%mm5, %%mm4 \n\t"\ | |
508 "packsswb %%mm7, %%mm6 \n\t"\ | |
509 "pxor %%mm7, %%mm7 \n\t"\ | |
510 "movd %4, %%mm2 \n\t"\ | |
511 "punpcklbw %%mm2, %%mm2 \n\t"\ | |
512 "punpcklbw %%mm2, %%mm2 \n\t"\ | |
513 "punpcklbw %%mm2, %%mm2 \n\t"\ | |
514 "psubusb %%mm4, %%mm2 \n\t"\ | |
515 "movq %%mm2, %%mm3 \n\t"\ | |
516 "psubusb %%mm4, %%mm3 \n\t"\ | |
517 "psubb %%mm3, %%mm2 \n\t"\ | |
518 "movq %1, %%mm3 \n\t"\ | |
519 "movq %2, %%mm4 \n\t"\ | |
520 "pxor %%mm6, %%mm3 \n\t"\ | |
521 "pxor %%mm6, %%mm4 \n\t"\ | |
522 "paddusb %%mm2, %%mm3 \n\t"\ | |
523 "psubusb %%mm2, %%mm4 \n\t"\ | |
524 "pxor %%mm6, %%mm3 \n\t"\ | |
525 "pxor %%mm6, %%mm4 \n\t"\ | |
526 "paddusb %%mm2, %%mm2 \n\t"\ | |
527 "packsswb %%mm1, %%mm0 \n\t"\ | |
528 "pcmpgtb %%mm0, %%mm7 \n\t"\ | |
529 "pxor %%mm7, %%mm0 \n\t"\ | |
530 "psubb %%mm7, %%mm0 \n\t"\ | |
531 "movq %%mm0, %%mm1 \n\t"\ | |
532 "psubusb %%mm2, %%mm0 \n\t"\ | |
533 "psubb %%mm0, %%mm1 \n\t"\ | |
534 "pand %5, %%mm1 \n\t"\ | |
535 "psrlw $2, %%mm1 \n\t"\ | |
536 "pxor %%mm7, %%mm1 \n\t"\ | |
537 "psubb %%mm7, %%mm1 \n\t"\ | |
538 "movq %0, %%mm5 \n\t"\ | |
539 "movq %3, %%mm6 \n\t"\ | |
540 "psubb %%mm1, %%mm5 \n\t"\ | |
541 "paddb %%mm1, %%mm6 \n\t" | |
542 | |
/**
 * H.263 loop filter, vertical-edge case: filters the 4 rows that straddle
 * a horizontal block boundary (src-2*stride .. src+1*stride, 8 pixels wide).
 * The shared H263_LOOP_FILTER asm body leaves the filtered rows in
 * mm5/mm3/mm4/mm6 (for operands %0/%1/%2/%3); they are stored back here.
 *
 * @param src    pointer to the first row BELOW the block edge
 * @param stride byte distance between vertically adjacent pixels
 * @param qscale quantizer index, mapped to filter strength via
 *               ff_h263_loop_filter_strength[]
 */
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        /* write the filtered rows back (register->operand mapping is
           fixed by the H263_LOOP_FILTER macro: 5 3 4 6 -> %0 %1 %2 %3) */
        "movq %%mm3, %1			\n\t"
        "movq %%mm4, %2			\n\t"
        "movq %%mm5, %0			\n\t"
        "movq %%mm6, %3			\n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
}
561 | |
/**
 * Transpose a 4x4 block of bytes: dst[x][y] = src[y][x].
 * Implemented with MMX byte/word unpacks; clobbers mm0-mm3.
 *
 * @param dst        destination of the transposed 4x4 block
 * @param src        source 4x4 block
 * @param dst_stride row stride of dst in bytes
 * @param src_stride row stride of src in bytes
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0		\n\t"
        "movd  %5, %%mm1		\n\t"
        "movd  %6, %%mm2		\n\t"
        "movd  %7, %%mm3		\n\t"
        /* interleave bytes of row pairs, then words, to rotate the block */
        "punpcklbw %%mm1, %%mm0		\n\t"
        "punpcklbw %%mm3, %%mm2		\n\t"
        "movq %%mm0, %%mm1		\n\t"
        "punpcklwd %%mm2, %%mm0		\n\t"
        "punpckhwd %%mm2, %%mm1		\n\t"
        /* each dword of mm0/mm1 is now one transposed row */
        "movd  %%mm0, %0		\n\t"
        "punpckhdq %%mm0, %%mm0		\n\t"
        "movd  %%mm0, %1		\n\t"
        "movd  %%mm1, %2		\n\t"
        "punpckhdq %%mm1, %%mm1		\n\t"
        "movd  %%mm1, %3		\n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        : "m" (*(uint32_t*)(src + 0*src_stride)),
          "m" (*(uint32_t*)(src + 1*src_stride)),
          "m" (*(uint32_t*)(src + 2*src_stride)),
          "m" (*(uint32_t*)(src + 3*src_stride))
    );
}
590 | |
/**
 * H.263 loop filter, horizontal-edge case (vertical block boundary).
 * The 4 columns around the edge are transposed into a temp buffer so the
 * same vertical-filter asm (H263_LOOP_FILTER) can be reused, then the
 * filtered data is transposed back into the image.
 *
 * NOTE(review): this relies on MMX registers mm3..mm6 keeping their
 * values BETWEEN the two asm statements (no emms / clobbering code in
 * between) — fragile but intentional here.
 */
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    const int strength= ff_h263_loop_filter_strength[qscale];
    uint64_t temp[4] __attribute__ ((aligned(8)));
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2; /* step back to the 2 columns left of the edge */

    /* gather the 8x4 column strip as 4x8 rows in temp[] */
    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    asm volatile(
        H263_LOOP_FILTER // 5 3 4 6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    asm volatile(
        /* transpose the filtered rows (still in mm5,mm3,mm4,mm6) back
           into 8 destination rows, 4 bytes each */
        "movq %%mm5, %%mm1		\n\t"
        "movq %%mm4, %%mm0		\n\t"
        "punpcklbw %%mm3, %%mm5		\n\t"
        "punpcklbw %%mm6, %%mm4		\n\t"
        "punpckhbw %%mm3, %%mm1		\n\t"
        "punpckhbw %%mm6, %%mm0		\n\t"
        "movq %%mm5, %%mm3		\n\t"
        "movq %%mm1, %%mm6		\n\t"
        "punpcklwd %%mm4, %%mm5		\n\t"
        "punpcklwd %%mm0, %%mm1		\n\t"
        "punpckhwd %%mm4, %%mm3		\n\t"
        "punpckhwd %%mm0, %%mm6		\n\t"
        "movd %%mm5, %0			\n\t"
        "punpckhdq %%mm5, %%mm5		\n\t"
        "movd %%mm5, %1			\n\t"
        "movd %%mm3, %2			\n\t"
        "punpckhdq %%mm3, %%mm3		\n\t"
        "movd %%mm3, %3			\n\t"
        "movd %%mm1, %4			\n\t"
        "punpckhdq %%mm1, %%mm1		\n\t"
        "movd %%mm1, %5			\n\t"
        "movd %%mm6, %6			\n\t"
        "punpckhdq %%mm6, %%mm6		\n\t"
        "movd %%mm6, %7			\n\t"
        : "=m" (*(uint32_t*)(src + 0*stride)),
          "=m" (*(uint32_t*)(src + 1*stride)),
          "=m" (*(uint32_t*)(src + 2*stride)),
          "=m" (*(uint32_t*)(src + 3*stride)),
          "=m" (*(uint32_t*)(src + 4*stride)),
          "=m" (*(uint32_t*)(src + 5*stride)),
          "=m" (*(uint32_t*)(src + 6*stride)),
          "=m" (*(uint32_t*)(src + 7*stride))
    );
}
645 | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
646 #ifdef CONFIG_ENCODERS |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
984
diff
changeset
|
/**
 * Sum of squares of all pixels in a 16x16 block (MMX).
 * Per row: the 16 bytes are widened to words (punpck with zeroed mm0),
 * squared and pair-summed with pmaddwd, and accumulated as dwords in mm7;
 * the two final dword halves are added at the end.
 *
 * @param pix       top-left of the 16x16 block
 * @param line_size byte stride between rows
 * @return          sum of pix[i]^2 over the block
 */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"		/* 16 rows */
      "pxor %%mm0,%%mm0\n"		/* zero, for byte->word widening */
      "pxor %%mm7,%%mm7\n"		/* dword accumulator */
      "1:\n"
      "movq (%0),%%mm2\n"	/* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"	/* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"	/* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n"	/* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n"	/* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"	/* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n"	/* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n"	/* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"	/* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
      "pmaddwd %%mm2,%%mm2\n"	/* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"	/* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                           pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "addl %2, %0\n"		/* next row */
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"	/* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" );
    return tmp;
}
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
984
diff
changeset
|
689 |
/**
 * Sum of squared errors between two 16-pixel-wide blocks over h rows (MMX).
 * |pix1 - pix2| is computed per byte with the psubusb/por trick (saturated
 * subtraction in both directions, OR-ed), then widened to words, squared
 * and pair-summed with pmaddwd, and accumulated as dwords in mm7.
 *
 * @param v         opaque context pointer, unused (DSP function signature)
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte stride between rows (same for both blocks)
 * @param h         number of rows to process
 * @return          sum of (pix1[i]-pix2[i])^2
 */
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"		/* row counter = h */
      "pxor %%mm0,%%mm0\n"	/* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"	/* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"	/* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"	/* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"	/* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"	/* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: substract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "addl %3,%0\n"		/* next row of pix1 */
      "addl %3,%1\n"		/* next row of pix2 */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"	/* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" (line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
984
diff
changeset
|
749 |
866 | 750 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ |
751 int i=0; | |
752 asm volatile( | |
753 "1: \n\t" | |
754 "movq (%2, %0), %%mm0 \n\t" | |
755 "movq (%1, %0), %%mm1 \n\t" | |
756 "psubb %%mm0, %%mm1 \n\t" | |
757 "movq %%mm1, (%3, %0) \n\t" | |
758 "movq 8(%2, %0), %%mm0 \n\t" | |
759 "movq 8(%1, %0), %%mm1 \n\t" | |
760 "psubb %%mm0, %%mm1 \n\t" | |
761 "movq %%mm1, 8(%3, %0) \n\t" | |
762 "addl $16, %0 \n\t" | |
763 "cmpl %4, %0 \n\t" | |
764 " jb 1b \n\t" | |
765 : "+r" (i) | |
766 : "r"(src1), "r"(src2), "r"(dst), "r"(w-15) | |
767 ); | |
768 for(; i<w; i++) | |
769 dst[i+0] = src1[i+0]-src2[i+0]; | |
770 } | |
1527 | 771 |
/**
 * HuffYUV median-prediction subtract (MMX2, needs pmaxub/pminub):
 *   dst[i] = src2[i] - median(L, T, L + T - LT)
 * with T = src1[i] (top), L = src2[i-1] (left), LT = src1[i-1] (top-left).
 *
 * The asm reads one byte BEFORE src1 and src2 for element 0, so dst[0] is
 * recomputed in C afterwards from the caller-supplied *left / *left_top.
 * On return, *left and *left_top are updated to the last-column values
 * for the next call.
 *
 * NOTE(review): the loop advances 8 bytes at a time and exits when i >= w,
 * so it processes up to 7 bytes past w when w is not a multiple of 8 —
 * callers presumably guarantee suitably padded buffers; confirm.
 */
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i=0;
    uint8_t l, lt;

    asm volatile(
        "1:				\n\t"
        "movq  -1(%1, %0), %%mm0	\n\t" // LT
        "movq  (%1, %0), %%mm1		\n\t" // T
        "movq  -1(%2, %0), %%mm2	\n\t" // L
        "movq  (%2, %0), %%mm3		\n\t" // X
        "movq %%mm2, %%mm4		\n\t" // L
        "psubb %%mm0, %%mm2		\n\t"
        "paddb %%mm1, %%mm2		\n\t" // L + T - LT
        "movq %%mm4, %%mm5		\n\t" // L
        "pmaxub %%mm1, %%mm4		\n\t" // max(T, L)
        "pminub %%mm5, %%mm1		\n\t" // min(T, L)
        /* clamp L+T-LT into [min(T,L), max(T,L)] -> the median */
        "pminub %%mm2, %%mm4		\n\t"
        "pmaxub %%mm1, %%mm4		\n\t"
        "psubb %%mm4, %%mm3		\n\t" // dst - pred
        "movq %%mm3, (%3, %0)		\n\t"
        "addl $8, %0			\n\t"
        "cmpl %4, %0			\n\t"
        " jb 1b				\n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"(w)
    );

    l= *left;
    lt= *left_top;

    /* first element used stale left neighbors in the asm; redo it in C */
    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
807 | |
/* One butterfly stage on two word-vector register pairs at once:
 * a += b; b = b_old - a_old (b is doubled first so no scratch register
 * is needed for the subtraction). */
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "		\n\t"\
    "paddw " #b2 ", " #a2 "		\n\t"\
    "paddw " #b1 ", " #b1 "		\n\t"\
    "paddw " #b2 ", " #b2 "		\n\t"\
    "psubw " #a1 ", " #b1 "		\n\t"\
    "psubw " #a2 ", " #b2 "		\n\t"

/* 3-stage 8-point Hadamard transform (unnormalized) across mm0..mm7. */
#define HADAMARD48\
    LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
    LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
    LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
    LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
    LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
    LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\

/* a = |a| per signed word; z is clobbered (holds the sign mask). */
#define MMABS(a,z)\
    "pxor " #z ", " #z "		\n\t"\
    "pcmpgtw " #a ", " #z "		\n\t"\
    "pxor " #z ", " #a "		\n\t"\
    "psubw " #z ", " #a "		\n\t"

/* sum += |a| per word, with unsigned saturation; z is clobbered. */
#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z "		\n\t"\
    "pcmpgtw " #a ", " #z "		\n\t"\
    "pxor " #z ", " #a "		\n\t"\
    "psubw " #z ", " #a "		\n\t"\
    "paddusw " #a ", " #sum "	\n\t"

/* MMX2 variant of MMABS using pmaxsw: a = max(a, -a). */
#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "		\n\t"\
    "psubw " #a ", " #z "		\n\t"\
    "pmaxsw " #z ", " #a "		\n\t"

/* MMX2 variant of MMABS_SUM using pmaxsw. */
#define MMABS_SUM_MMX2(a,z, sum)\
    "pxor " #z ", " #z "		\n\t"\
    "psubw " #a ", " #z "		\n\t"\
    "pmaxsw " #z ", " #a "		\n\t"\
    "paddusw " #a ", " #sum "	\n\t"

/* Interleave elements of size n of a and b: low halves into a, high
 * halves into t (b is preserved). */
#define SBUTTERFLY(a,b,t,n)\
    "movq " #a ", " #t "		\n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "	\n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "	\n\t" /* cgdh */\

/* Transpose a 4x4 block of words held in a,b,c,d (t is scratch);
 * the transposed rows end up in a, d, t, c (see per-line comments). */
#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

/* Load 4 rows (16-byte row stride) of the word buffer at %1+o. */
#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1), " #a "		\n\t"\
    "movq "#o"+16(%1), " #b "	\n\t"\
    "movq "#o"+32(%1), " #c "	\n\t"\
    "movq "#o"+48(%1), " #d "	\n\t"

/* Store 4 rows (16-byte row stride) of the word buffer at %1+o. */
#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)		\n\t"\
    "movq "#b", "#o"+16(%1)	\n\t"\
    "movq "#c", "#o"+32(%1)	\n\t"\
    "movq "#d", "#o"+48(%1)	\n\t"\

/**
 * 8x8 Hadamard-transformed SAD (SATD) between src1 and src2, plain MMX.
 * diff_pixels_mmx() fills temp[] with the 8x8 word difference; the block
 * is Hadamard-transformed along rows (with a transpose in between) and
 * columns, then the absolute values of all coefficients are summed with
 * saturating word adds.
 *
 * @param s      unused context pointer (DSP function signature)
 * @param h      must be 8 (asserted); 16-row use goes via WARPER8_16_SQ
 * @return       SATD, masked to 16 bits (sum is accumulated in words)
 */
static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
    uint64_t temp[16] __align8;
    int sum=0;

    assert(h==8);

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        /* left 8x4 halves: transform + transpose, spill via temp */
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)		\n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7 	\n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        /* right 8x4 halves */
        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)		\n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7 	\n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5		\n\t"//FIXME remove
        "movq %%mm6, %%mm7		\n\t"
        "movq %%mm0, %%mm6		\n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        /* second pass (other direction) + absolute-value accumulation */
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)		\n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1		\n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)		\n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)		\n\t"
        MMABS(%%mm0, %%mm7)
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        MMABS_SUM(%%mm2, %%mm7, %%mm0)
        MMABS_SUM(%%mm3, %%mm7, %%mm0)
        MMABS_SUM(%%mm4, %%mm7, %%mm0)
        MMABS_SUM(%%mm5, %%mm7, %%mm0)
        MMABS_SUM(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1		\n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1		\n\t"
        MMABS_SUM(%%mm1, %%mm7, %%mm0)

        /* horizontal add of the four word partial sums in mm0 */
        "movq %%mm0, %%mm1		\n\t"
        "psrlq $32, %%mm0		\n\t"
        "paddusw %%mm1, %%mm0	\n\t"
        "movq %%mm0, %%mm1		\n\t"
        "psrlq $16, %%mm0		\n\t"
        "paddusw %%mm1, %%mm0	\n\t"
        "movd %%mm0, %0		\n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}
957 | |
/**
 * 8x8 Hadamard-transformed SAD (SATD), MMX2 version.  Identical structure
 * to hadamard8_diff_mmx() but uses the shorter pmaxsw-based absolute
 * value macros (MMABS_MMX2 / MMABS_SUM_MMX2).
 *
 * @param s      unused context pointer (DSP function signature)
 * @param h      must be 8 (asserted); 16-row use goes via WARPER8_16_SQ
 * @return       SATD, masked to 16 bits
 */
static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
    uint64_t temp[16] __align8;
    int sum=0;

    assert(h==8);

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

    asm volatile(
        /* left 8x4 halves: transform + transpose, spill via temp */
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 112(%1)		\n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 112(%1), %%mm7 	\n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

        /* right 8x4 halves */
        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48

        "movq %%mm7, 120(%1)		\n\t"

        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

        "movq 120(%1), %%mm7 	\n\t"
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
        "movq %%mm7, %%mm5		\n\t"//FIXME remove
        "movq %%mm6, %%mm7		\n\t"
        "movq %%mm0, %%mm6		\n\t"
//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

        /* second pass (other direction) + absolute-value accumulation */
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, 64(%1)		\n\t"
        MMABS_MMX2(%%mm0, %%mm7)
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
        "movq 64(%1), %%mm1		\n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        "movq %%mm0, 64(%1)		\n\t"

        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

        HADAMARD48
        "movq %%mm7, (%1)		\n\t"
        MMABS_MMX2(%%mm0, %%mm7)
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
        "movq (%1), %%mm1		\n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
        "movq 64(%1), %%mm1		\n\t"
        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)

        /* horizontal add of the four word partial sums in mm0 */
        "movq %%mm0, %%mm1		\n\t"
        "psrlq $32, %%mm0		\n\t"
        "paddusw %%mm1, %%mm0	\n\t"
        "movq %%mm0, %%mm1		\n\t"
        "psrlq $16, %%mm0		\n\t"
        "paddusw %%mm1, %%mm0	\n\t"
        "movd %%mm0, %0		\n\t"

        : "=r" (sum)
        : "r"(temp)
    );
    return sum&0xFFFF;
}
1044 | |
1045 | |
1708 | 1046 WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx) |
1047 WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2) | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
1048 #endif //CONFIG_ENCODERS |
866 | 1049 |
959 | 1050 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d) |
1051 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d) | |
1052 | |
954 | 1053 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ |
1054 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ | |
961 | 1055 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ |
954 | 1056 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ |
1057 "movq "#in7", " #m3 " \n\t" /* d */\ | |
1058 "movq "#in0", %%mm5 \n\t" /* D */\ | |
1059 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\ | |
1060 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\ | |
1061 "movq "#in1", %%mm5 \n\t" /* C */\ | |
1062 "movq "#in2", %%mm6 \n\t" /* B */\ | |
1063 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\ | |
1064 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ | |
1065 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ | |
1066 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ | |
961 | 1067 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ |
954 | 1068 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\ |
1069 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ | |
1070 "psraw $5, %%mm5 \n\t"\ | |
1071 "packuswb %%mm5, %%mm5 \n\t"\ | |
1072 OP(%%mm5, out, %%mm7, d) | |
1073 | |
959 | 1074 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\ |
1057 | 1075 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
954 | 1076 uint64_t temp;\ |
1077 \ | |
1078 asm volatile(\ | |
1079 "pxor %%mm7, %%mm7 \n\t"\ | |
1080 "1: \n\t"\ | |
1081 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ | |
1082 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ | |
1083 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ | |
1084 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ | |
1085 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ | |
1086 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ | |
1087 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ | |
1088 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ | |
1089 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ | |
1090 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ | |
1091 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ | |
1092 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ | |
1093 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ | |
1094 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ | |
1095 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ | |
1096 "paddw %%mm3, %%mm5 \n\t" /* b */\ | |
1097 "paddw %%mm2, %%mm6 \n\t" /* c */\ | |
1098 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
1099 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ | |
1100 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ | |
961 | 1101 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ |
954 | 1102 "paddw %%mm4, %%mm0 \n\t" /* a */\ |
1103 "paddw %%mm1, %%mm5 \n\t" /* d */\ | |
961 | 1104 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ |
954 | 1105 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ |
961 | 1106 "paddw %6, %%mm6 \n\t"\ |
954 | 1107 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
1108 "psraw $5, %%mm0 \n\t"\ | |
961 | 1109 "movq %%mm0, %5 \n\t"\ |
954 | 1110 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ |
1111 \ | |
1112 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ | |
1113 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\ | |
1114 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\ | |
1115 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\ | |
1116 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\ | |
1117 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\ | |
1118 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\ | |
1119 "paddw %%mm0, %%mm2 \n\t" /* b */\ | |
1120 "paddw %%mm5, %%mm3 \n\t" /* c */\ | |
1121 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ | |
1122 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ | |
1123 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\ | |
1124 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ | |
1125 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ | |
1126 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ | |
961 | 1127 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ |
954 | 1128 "paddw %%mm2, %%mm1 \n\t" /* a */\ |
1129 "paddw %%mm6, %%mm4 \n\t" /* d */\ | |
961 | 1130 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ |
954 | 1131 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ |
961 | 1132 "paddw %6, %%mm1 \n\t"\ |
954 | 1133 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ |
1134 "psraw $5, %%mm3 \n\t"\ | |
961 | 1135 "movq %5, %%mm1 \n\t"\ |
954 | 1136 "packuswb %%mm3, %%mm1 \n\t"\ |
959 | 1137 OP_MMX2(%%mm1, (%1),%%mm4, q)\ |
954 | 1138 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ |
1139 \ | |
1140 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ | |
1141 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ | |
1142 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ | |
1143 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\ | |
1144 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\ | |
1145 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\ | |
1146 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\ | |
1147 "paddw %%mm1, %%mm5 \n\t" /* b */\ | |
1148 "paddw %%mm4, %%mm0 \n\t" /* c */\ | |
1149 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
1150 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ | |
1151 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ | |
1152 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ | |
961 | 1153 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ |
954 | 1154 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ |
1155 "paddw %%mm3, %%mm2 \n\t" /* d */\ | |
1156 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ | |
1157 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\ | |
1158 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ | |
1159 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ | |
1160 "paddw %%mm2, %%mm6 \n\t" /* a */\ | |
961 | 1161 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ |
1162 "paddw %6, %%mm0 \n\t"\ | |
954 | 1163 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
1164 "psraw $5, %%mm0 \n\t"\ | |
1165 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ | |
1166 \ | |
1167 "paddw %%mm5, %%mm3 \n\t" /* a */\ | |
1168 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\ | |
1169 "paddw %%mm4, %%mm6 \n\t" /* b */\ | |
1170 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\ | |
1171 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\ | |
1172 "paddw %%mm1, %%mm4 \n\t" /* c */\ | |
1173 "paddw %%mm2, %%mm5 \n\t" /* d */\ | |
1174 "paddw %%mm6, %%mm6 \n\t" /* 2b */\ | |
1175 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ | |
961 | 1176 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ |
1177 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ | |
954 | 1178 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ |
961 | 1179 "paddw %6, %%mm4 \n\t"\ |
954 | 1180 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ |
1181 "psraw $5, %%mm4 \n\t"\ | |
1182 "packuswb %%mm4, %%mm0 \n\t"\ | |
959 | 1183 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ |
954 | 1184 \ |
1185 "addl %3, %0 \n\t"\ | |
1186 "addl %4, %1 \n\t"\ | |
1187 "decl %2 \n\t"\ | |
1188 " jnz 1b \n\t"\ | |
967 | 1189 : "+a"(src), "+c"(dst), "+m"(h)\ |
966 | 1190 : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ |
1191 : "memory"\ | |
954 | 1192 );\ |
1193 }\ | |
1194 \ | |
1195 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
1196 int i;\ | |
1197 int16_t temp[16];\ | |
1198 /* quick HACK, XXX FIXME MUST be optimized */\ | |
1199 for(i=0; i<h; i++)\ | |
1200 {\ | |
1201 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ | |
1202 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ | |
1203 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ | |
1204 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ | |
1205 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ | |
1206 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\ | |
1207 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\ | |
1208 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\ | |
1209 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\ | |
1210 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\ | |
1211 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\ | |
1212 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\ | |
1213 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\ | |
1214 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\ | |
1215 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\ | |
1216 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\ | |
1217 asm volatile(\ | |
1218 "movq (%0), %%mm0 \n\t"\ | |
1219 "movq 8(%0), %%mm1 \n\t"\ | |
1220 "paddw %2, %%mm0 \n\t"\ | |
1221 "paddw %2, %%mm1 \n\t"\ | |
1222 "psraw $5, %%mm0 \n\t"\ | |
1223 "psraw $5, %%mm1 \n\t"\ | |
1224 "packuswb %%mm1, %%mm0 \n\t"\ | |
959 | 1225 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ |
954 | 1226 "movq 16(%0), %%mm0 \n\t"\ |
1227 "movq 24(%0), %%mm1 \n\t"\ | |
1228 "paddw %2, %%mm0 \n\t"\ | |
1229 "paddw %2, %%mm1 \n\t"\ | |
1230 "psraw $5, %%mm0 \n\t"\ | |
1231 "psraw $5, %%mm1 \n\t"\ | |
1232 "packuswb %%mm1, %%mm0 \n\t"\ | |
959 | 1233 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\ |
954 | 1234 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ |
966 | 1235 : "memory"\ |
954 | 1236 );\ |
1237 dst+=dstStride;\ | |
1238 src+=srcStride;\ | |
1239 }\ | |
1240 }\ | |
1241 \ | |
1057 | 1242 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
959 | 1243 uint64_t temp;\ |
1244 \ | |
1245 asm volatile(\ | |
1246 "pxor %%mm7, %%mm7 \n\t"\ | |
1247 "1: \n\t"\ | |
1248 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ | |
1249 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ | |
1250 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ | |
1251 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ | |
1252 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ | |
1253 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ | |
1254 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ | |
1255 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ | |
1256 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ | |
1257 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ | |
1258 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ | |
1259 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ | |
1260 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ | |
1261 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ | |
1262 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ | |
1263 "paddw %%mm3, %%mm5 \n\t" /* b */\ | |
1264 "paddw %%mm2, %%mm6 \n\t" /* c */\ | |
1265 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
1266 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ | |
1267 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ | |
961 | 1268 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ |
959 | 1269 "paddw %%mm4, %%mm0 \n\t" /* a */\ |
1270 "paddw %%mm1, %%mm5 \n\t" /* d */\ | |
961 | 1271 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ |
959 | 1272 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ |
961 | 1273 "paddw %6, %%mm6 \n\t"\ |
959 | 1274 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
1275 "psraw $5, %%mm0 \n\t"\ | |
1276 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ | |
1277 \ | |
1278 "movd 5(%0), %%mm5 \n\t" /* FGHI */\ | |
1279 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\ | |
1280 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\ | |
1281 "paddw %%mm5, %%mm1 \n\t" /* a */\ | |
1282 "paddw %%mm6, %%mm2 \n\t" /* b */\ | |
1283 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\ | |
1284 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\ | |
1285 "paddw %%mm6, %%mm3 \n\t" /* c */\ | |
1286 "paddw %%mm5, %%mm4 \n\t" /* d */\ | |
1287 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ | |
1288 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ | |
961 | 1289 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ |
1290 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ | |
959 | 1291 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ |
961 | 1292 "paddw %6, %%mm1 \n\t"\ |
959 | 1293 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ |
1294 "psraw $5, %%mm3 \n\t"\ | |
1295 "packuswb %%mm3, %%mm0 \n\t"\ | |
1296 OP_MMX2(%%mm0, (%1), %%mm4, q)\ | |
1297 \ | |
1298 "addl %3, %0 \n\t"\ | |
1299 "addl %4, %1 \n\t"\ | |
1300 "decl %2 \n\t"\ | |
961 | 1301 " jnz 1b \n\t"\ |
967 | 1302 : "+a"(src), "+c"(dst), "+m"(h)\ |
966 | 1303 : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ |
1304 : "memory"\ | |
959 | 1305 );\ |
1306 }\ | |
1307 \ | |
1308 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
1309 int i;\ | |
1310 int16_t temp[8];\ | |
1311 /* quick HACK, XXX FIXME MUST be optimized */\ | |
1312 for(i=0; i<h; i++)\ | |
1313 {\ | |
1314 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ | |
1315 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ | |
1316 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ | |
1317 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ | |
1318 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ | |
1319 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\ | |
1320 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\ | |
1321 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\ | |
1322 asm volatile(\ | |
1323 "movq (%0), %%mm0 \n\t"\ | |
1324 "movq 8(%0), %%mm1 \n\t"\ | |
1325 "paddw %2, %%mm0 \n\t"\ | |
1326 "paddw %2, %%mm1 \n\t"\ | |
1327 "psraw $5, %%mm0 \n\t"\ | |
1328 "psraw $5, %%mm1 \n\t"\ | |
1329 "packuswb %%mm1, %%mm0 \n\t"\ | |
1330 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ | |
1331 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ | |
966 | 1332 :"memory"\ |
959 | 1333 );\ |
1334 dst+=dstStride;\ | |
1335 src+=srcStride;\ | |
1336 }\ | |
1337 } | |
1338 | |
1339 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ | |
1340 \ | |
1341 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
954 | 1342 uint64_t temp[17*4];\ |
1343 uint64_t *temp_ptr= temp;\ | |
1344 int count= 17;\ | |
1345 \ | |
1346 /*FIXME unroll */\ | |
1347 asm volatile(\ | |
1348 "pxor %%mm7, %%mm7 \n\t"\ | |
1349 "1: \n\t"\ | |
1350 "movq (%0), %%mm0 \n\t"\ | |
1351 "movq (%0), %%mm1 \n\t"\ | |
1352 "movq 8(%0), %%mm2 \n\t"\ | |
1353 "movq 8(%0), %%mm3 \n\t"\ | |
1354 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1355 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
1356 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1357 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1358 "movq %%mm0, (%1) \n\t"\ | |
1359 "movq %%mm1, 17*8(%1) \n\t"\ | |
967 | 1360 "movq %%mm2, 2*17*8(%1) \n\t"\ |
1361 "movq %%mm3, 3*17*8(%1) \n\t"\ | |
954 | 1362 "addl $8, %1 \n\t"\ |
1363 "addl %3, %0 \n\t"\ | |
1364 "decl %2 \n\t"\ | |
1365 " jnz 1b \n\t"\ | |
1366 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ | |
967 | 1367 : "r" (srcStride)\ |
966 | 1368 : "memory"\ |
954 | 1369 );\ |
1370 \ | |
1371 temp_ptr= temp;\ | |
1372 count=4;\ | |
1373 \ | |
1374 /*FIXME reorder for speed */\ | |
1375 asm volatile(\ | |
1376 /*"pxor %%mm7, %%mm7 \n\t"*/\ | |
1377 "1: \n\t"\ | |
1378 "movq (%0), %%mm0 \n\t"\ | |
1379 "movq 8(%0), %%mm1 \n\t"\ | |
1380 "movq 16(%0), %%mm2 \n\t"\ | |
1381 "movq 24(%0), %%mm3 \n\t"\ | |
961 | 1382 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
1383 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ | |
954 | 1384 "addl %4, %1 \n\t"\ |
961 | 1385 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
954 | 1386 \ |
961 | 1387 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
954 | 1388 "addl %4, %1 \n\t"\ |
961 | 1389 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
1390 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ | |
954 | 1391 "addl %4, %1 \n\t"\ |
961 | 1392 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ |
1393 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ | |
954 | 1394 "addl %4, %1 \n\t"\ |
961 | 1395 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ |
1396 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ | |
954 | 1397 "addl %4, %1 \n\t"\ |
961 | 1398 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ |
1399 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ | |
954 | 1400 "addl %4, %1 \n\t"\ |
961 | 1401 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ |
954 | 1402 \ |
961 | 1403 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ |
954 | 1404 "addl %4, %1 \n\t" \ |
961 | 1405 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ |
1406 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ | |
954 | 1407 \ |
1408 "addl $136, %0 \n\t"\ | |
961 | 1409 "addl %6, %1 \n\t"\ |
954 | 1410 "decl %2 \n\t"\ |
1411 " jnz 1b \n\t"\ | |
958
9bb668034ecf
slowdown / gcc 2.95.* bug workaround (this should be reversed as soon as gcc 2.95.* support is droped)
michaelni
parents:
954
diff
changeset
|
1412 \ |
967 | 1413 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
961 | 1414 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\ |
966 | 1415 :"memory"\ |
954 | 1416 );\ |
1417 }\ | |
1418 \ | |
1057 | 1419 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
954 | 1420 uint64_t temp[9*4];\ |
1421 uint64_t *temp_ptr= temp;\ | |
1422 int count= 9;\ | |
1423 \ | |
1424 /*FIXME unroll */\ | |
1425 asm volatile(\ | |
1426 "pxor %%mm7, %%mm7 \n\t"\ | |
1427 "1: \n\t"\ | |
1428 "movq (%0), %%mm0 \n\t"\ | |
1429 "movq (%0), %%mm1 \n\t"\ | |
1430 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1431 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
1432 "movq %%mm0, (%1) \n\t"\ | |
1433 "movq %%mm1, 9*8(%1) \n\t"\ | |
1434 "addl $8, %1 \n\t"\ | |
1435 "addl %3, %0 \n\t"\ | |
1436 "decl %2 \n\t"\ | |
1437 " jnz 1b \n\t"\ | |
1438 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ | |
1439 : "r" (srcStride)\ | |
966 | 1440 : "memory"\ |
954 | 1441 );\ |
1442 \ | |
1443 temp_ptr= temp;\ | |
1444 count=2;\ | |
1445 \ | |
1446 /*FIXME reorder for speed */\ | |
1447 asm volatile(\ | |
1448 /*"pxor %%mm7, %%mm7 \n\t"*/\ | |
1449 "1: \n\t"\ | |
1450 "movq (%0), %%mm0 \n\t"\ | |
1451 "movq 8(%0), %%mm1 \n\t"\ | |
1452 "movq 16(%0), %%mm2 \n\t"\ | |
1453 "movq 24(%0), %%mm3 \n\t"\ | |
961 | 1454 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
1455 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ | |
954 | 1456 "addl %4, %1 \n\t"\ |
961 | 1457 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
954 | 1458 \ |
961 | 1459 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
954 | 1460 "addl %4, %1 \n\t"\ |
961 | 1461 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
954 | 1462 \ |
961 | 1463 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ |
954 | 1464 "addl %4, %1 \n\t"\ |
961 | 1465 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ |
1466 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ | |
954 | 1467 \ |
1468 "addl $72, %0 \n\t"\ | |
961 | 1469 "addl %6, %1 \n\t"\ |
954 | 1470 "decl %2 \n\t"\ |
1471 " jnz 1b \n\t"\ | |
1472 \ | |
961 | 1473 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
1474 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\ | |
966 | 1475 : "memory"\ |
1476 );\ | |
959 | 1477 }\ |
954 | 1478 \ |
1064 | 1479 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ |
959 | 1480 OPNAME ## pixels8_mmx(dst, src, stride, 8);\ |
954 | 1481 }\ |
1482 \ | |
1064 | 1483 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1484 uint64_t temp[8];\ |
954 | 1485 uint8_t * const half= (uint8_t*)temp;\ |
1486 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
1487 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\ | |
1488 }\ | |
1489 \ | |
1064 | 1490 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 1491 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ |
1492 }\ | |
1493 \ | |
1064 | 1494 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1495 uint64_t temp[8];\ |
954 | 1496 uint8_t * const half= (uint8_t*)temp;\ |
1497 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
1498 OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\ | |
1499 }\ | |
1500 \ | |
1064 | 1501 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1502 uint64_t temp[8];\ |
954 | 1503 uint8_t * const half= (uint8_t*)temp;\ |
959 | 1504 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ |
954 | 1505 OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\ |
1506 }\ | |
1507 \ | |
1064 | 1508 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
959 | 1509 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ |
954 | 1510 }\ |
1511 \ | |
1064 | 1512 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1513 uint64_t temp[8];\ |
954 | 1514 uint8_t * const half= (uint8_t*)temp;\ |
959 | 1515 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ |
954 | 1516 OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\ |
1517 }\ | |
1064 | 1518 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1519 uint64_t half[8 + 9];\ |
1520 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1521 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 1522 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
984 | 1523 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\ |
959 | 1524 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
984 | 1525 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\ |
954 | 1526 }\ |
1064 | 1527 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1528 uint64_t half[8 + 9];\ |
1529 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1530 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 1531 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
984 | 1532 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\ |
959 | 1533 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
984 | 1534 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\ |
954 | 1535 }\ |
1064 | 1536 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1537 uint64_t half[8 + 9];\ |
1538 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1539 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 1540 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
984 | 1541 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\ |
959 | 1542 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
984 | 1543 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\ |
954 | 1544 }\ |
1064 | 1545 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1546 uint64_t half[8 + 9];\ |
1547 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1548 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1549 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1550 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\ | |
959 | 1551 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
984 | 1552 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\ |
954 | 1553 }\ |
1064 | 1554 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1555 uint64_t half[8 + 9];\ |
954 | 1556 uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
1557 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1558 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
959 | 1559 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
954 | 1560 OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\ |
1561 }\ | |
1064 | 1562 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1563 uint64_t half[8 + 9];\ |
954 | 1564 uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
1565 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1566 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
959 | 1567 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
954 | 1568 OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\ |
1569 }\ | |
1064 | 1570 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1571 uint64_t half[8 + 9];\ |
1572 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 1573 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
984 | 1574 put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\ |
1575 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ | |
954 | 1576 }\ |
1064 | 1577 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1578 uint64_t half[8 + 9];\ |
1579 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 1580 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
984 | 1581 put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\ |
1582 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ | |
954 | 1583 }\ |
1064 | 1584 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1585 uint64_t half[9];\ |
954 | 1586 uint8_t * const halfH= ((uint8_t*)half);\ |
1587 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
959 | 1588 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ |
954 | 1589 }\ |
1064 | 1590 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ |
959 | 1591 OPNAME ## pixels16_mmx(dst, src, stride, 16);\ |
954 | 1592 }\ |
1593 \ | |
1064 | 1594 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 1595 uint64_t temp[32];\ |
1596 uint8_t * const half= (uint8_t*)temp;\ | |
1597 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ | |
1598 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\ | |
1599 }\ | |
1600 \ | |
1064 | 1601 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 1602 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ |
1603 }\ | |
1604 \ | |
1064 | 1605 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 1606 uint64_t temp[32];\ |
1607 uint8_t * const half= (uint8_t*)temp;\ | |
1608 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ | |
1609 OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\ | |
1610 }\ | |
1611 \ | |
1064 | 1612 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 1613 uint64_t temp[32];\ |
1614 uint8_t * const half= (uint8_t*)temp;\ | |
959 | 1615 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ |
954 | 1616 OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\ |
1617 }\ | |
1618 \ | |
1064 | 1619 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
959 | 1620 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ |
954 | 1621 }\ |
1622 \ | |
1064 | 1623 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 1624 uint64_t temp[32];\ |
1625 uint8_t * const half= (uint8_t*)temp;\ | |
959 | 1626 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ |
954 | 1627 OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\ |
1628 }\ | |
1064 | 1629 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1630 uint64_t half[16*2 + 17*2];\ |
1631 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1632 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 1633 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
984 | 1634 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\ |
959 | 1635 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
984 | 1636 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\ |
954 | 1637 }\ |
1064 | 1638 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1639 uint64_t half[16*2 + 17*2];\ |
1640 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1641 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 1642 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
984 | 1643 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\ |
959 | 1644 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
984 | 1645 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\ |
954 | 1646 }\ |
1064 | 1647 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1648 uint64_t half[16*2 + 17*2];\ |
1649 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1650 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 1651 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
984 | 1652 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\ |
959 | 1653 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
984 | 1654 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\ |
954 | 1655 }\ |
1064 | 1656 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1657 uint64_t half[16*2 + 17*2];\ |
1658 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1659 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1660 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1661 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\ | |
959 | 1662 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
984 | 1663 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\ |
954 | 1664 }\ |
1064 | 1665 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 1666 uint64_t half[16*2 + 17*2];\ |
1667 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1668 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1669 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
959 | 1670 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
954 | 1671 OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\ |
1672 }\ | |
1064 | 1673 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 1674 uint64_t half[16*2 + 17*2];\ |
1675 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1676 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1677 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
959 | 1678 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
954 | 1679 OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\ |
1680 }\ | |
1064 | 1681 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1682 uint64_t half[17*2];\ |
1683 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 1684 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
984 | 1685 put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\ |
1686 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ | |
954 | 1687 }\ |
1064 | 1688 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 1689 uint64_t half[17*2];\ |
1690 uint8_t * const halfH= ((uint8_t*)half);\ | |
954 | 1691 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
984 | 1692 put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\ |
1693 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ | |
954 | 1694 }\ |
1064 | 1695 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 1696 uint64_t half[17*2];\ |
1697 uint8_t * const halfH= ((uint8_t*)half);\ | |
1698 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
959 | 1699 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
954 | 1700 } |
1701 | |
1702 | |
/* Inline-asm "op" fragments plugged into the QPEL macros above.
 * a    = MMX register holding the computed pixels,
 * b    = destination memory operand,
 * temp = scratch MMX register (unused by PUT_OP),
 * size = mov suffix selecting the operand width (e.g. d or q). */

/* PUT: plain store of the computed pixels into the destination. */
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* AVG, 3DNow! flavor: load dst, average with the computed pixels via
 * pavgusb, store the result back. */
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
/* AVG, MMX2 flavor: same sequence but using the pavgb instruction. */
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
959 | 1712 |
/* Instantiate the qpel function families. Arguments select:
 * OPNAME prefix (put_/avg_/put_no_rnd_), rounding constant
 * (ff_pw_16 rounds, ff_pw_15 is the no-rounding variant), RND name
 * fragment, the store/average OP macro(s) from above, and the target
 * instruction set (3dnow or mmx2) for QPEL_OP. */
QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1722 | |
#if 0
/* Dummy no-op target for function pointers; only referenced by the
 * disabled speed-testing assignments at the end of dsputil_init_mmx(). */
static void just_return() { return; }
#endif
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
1726 |
/* Install the put / put_no_rnd / avg variants of one qpel routine into
 * the matching DSPContext slots; expects a DSPContext pointer named "c"
 * in the expanding scope. */
#define SET_QPEL_FUNC(postfix1, postfix2) \
c->put_ ## postfix1 = put_ ## postfix2;\
c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
c->avg_ ## postfix1 = avg_ ## postfix2;
1092 | 1731 |
1732 /* external functions, from idct_mmx.c */ | |
1733 void ff_mmx_idct(DCTELEM *block); | |
1734 void ff_mmxext_idct(DCTELEM *block); | |
1735 | |
1736 /* XXX: those functions should be suppressed ASAP when all IDCTs are | |
1737 converted */ | |
1738 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block) | |
1739 { | |
1740 ff_mmx_idct (block); | |
1741 put_pixels_clamped_mmx(block, dest, line_size); | |
1742 } | |
1743 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block) | |
1744 { | |
1745 ff_mmx_idct (block); | |
1746 add_pixels_clamped_mmx(block, dest, line_size); | |
1747 } | |
1748 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block) | |
1749 { | |
1750 ff_mmxext_idct (block); | |
1751 put_pixels_clamped_mmx(block, dest, line_size); | |
1752 } | |
1753 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block) | |
1754 { | |
1755 ff_mmxext_idct (block); | |
1756 add_pixels_clamped_mmx(block, dest, line_size); | |
1757 } | |
954 | 1758 |
1092 | 1759 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
0 | 1760 { |
1761 mm_flags = mm_support(); | |
1115 | 1762 |
1122 | 1763 if (avctx->dsp_mask) { |
1764 if (avctx->dsp_mask & FF_MM_FORCE) | |
1765 mm_flags |= (avctx->dsp_mask & 0xffff); | |
1766 else | |
1767 mm_flags &= ~(avctx->dsp_mask & 0xffff); | |
1768 } | |
1115 | 1769 |
631
47a8964ba5cd
be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents:
629
diff
changeset
|
1770 #if 0 |
47a8964ba5cd
be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents:
629
diff
changeset
|
1771 fprintf(stderr, "libavcodec: CPU flags:"); |
0 | 1772 if (mm_flags & MM_MMX) |
631
47a8964ba5cd
be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents:
629
diff
changeset
|
1773 fprintf(stderr, " mmx"); |
0 | 1774 if (mm_flags & MM_MMXEXT) |
631
47a8964ba5cd
be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents:
629
diff
changeset
|
1775 fprintf(stderr, " mmxext"); |
0 | 1776 if (mm_flags & MM_3DNOW) |
631
47a8964ba5cd
be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents:
629
diff
changeset
|
1777 fprintf(stderr, " 3dnow"); |
0 | 1778 if (mm_flags & MM_SSE) |
631
47a8964ba5cd
be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents:
629
diff
changeset
|
1779 fprintf(stderr, " sse"); |
0 | 1780 if (mm_flags & MM_SSE2) |
631
47a8964ba5cd
be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents:
629
diff
changeset
|
1781 fprintf(stderr, " sse2"); |
47a8964ba5cd
be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents:
629
diff
changeset
|
1782 fprintf(stderr, "\n"); |
0 | 1783 #endif |
1784 | |
1785 if (mm_flags & MM_MMX) { | |
1092 | 1786 const int dct_algo = avctx->dct_algo; |
1787 const int idct_algo= avctx->idct_algo; | |
1788 | |
1232
e88d3b1fb2a1
more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>)
michaelni
parents:
1186
diff
changeset
|
1789 #ifdef CONFIG_ENCODERS |
1565 | 1790 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ |
1791 if(mm_flags & MM_MMXEXT){ | |
1792 c->fdct = ff_fdct_mmx2; | |
1793 }else{ | |
1794 c->fdct = ff_fdct_mmx; | |
1795 } | |
1796 } | |
1232
e88d3b1fb2a1
more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>)
michaelni
parents:
1186
diff
changeset
|
1797 #endif //CONFIG_ENCODERS |
1092 | 1798 |
1799 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ | |
1800 c->idct_put= ff_simple_idct_put_mmx; | |
1801 c->idct_add= ff_simple_idct_add_mmx; | |
1324
7d328fd9d8a5
the return of the idct with 16bit output by ("Ivan Kalvachev" <ivan at cacad dot com>)
michaelni
parents:
1232
diff
changeset
|
1802 c->idct = ff_simple_idct_mmx; |
1092 | 1803 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; |
1804 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ | |
1805 if(mm_flags & MM_MMXEXT){ | |
1806 c->idct_put= ff_libmpeg2mmx2_idct_put; | |
1807 c->idct_add= ff_libmpeg2mmx2_idct_add; | |
1324
7d328fd9d8a5
the return of the idct with 16bit output by ("Ivan Kalvachev" <ivan at cacad dot com>)
michaelni
parents:
1232
diff
changeset
|
1808 c->idct = ff_mmxext_idct; |
1092 | 1809 }else{ |
1810 c->idct_put= ff_libmpeg2mmx_idct_put; | |
1811 c->idct_add= ff_libmpeg2mmx_idct_add; | |
1324
7d328fd9d8a5
the return of the idct with 16bit output by ("Ivan Kalvachev" <ivan at cacad dot com>)
michaelni
parents:
1232
diff
changeset
|
1812 c->idct = ff_mmx_idct; |
1092 | 1813 } |
1814 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; | |
1815 } | |
1816 | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
1817 #ifdef CONFIG_ENCODERS |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1818 c->get_pixels = get_pixels_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1819 c->diff_pixels = diff_pixels_mmx; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
1820 #endif //CONFIG_ENCODERS |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1821 c->put_pixels_clamped = put_pixels_clamped_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1822 c->add_pixels_clamped = add_pixels_clamped_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1823 c->clear_blocks = clear_blocks_mmx; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
1824 #ifdef CONFIG_ENCODERS |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1825 c->pix_sum = pix_sum16_mmx; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
1826 #endif //CONFIG_ENCODERS |
415 | 1827 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1828 c->put_pixels_tab[0][0] = put_pixels16_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1829 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1830 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1831 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx; |
0 | 1832 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1833 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1834 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1835 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1836 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx; |
651 | 1837 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1838 c->avg_pixels_tab[0][0] = avg_pixels16_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1839 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1840 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1841 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx; |
415 | 1842 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1843 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1844 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1845 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1846 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1847 |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1848 c->put_pixels_tab[1][0] = put_pixels8_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1849 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1850 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1851 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx; |
0 | 1852 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1853 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1854 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1855 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1856 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx; |
651 | 1857 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1858 c->avg_pixels_tab[1][0] = avg_pixels8_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1859 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1860 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1861 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx; |
651 | 1862 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1863 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1864 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1865 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1866 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx; |
954 | 1867 |
866 | 1868 c->add_bytes= add_bytes_mmx; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
1869 #ifdef CONFIG_ENCODERS |
866 | 1870 c->diff_bytes= diff_bytes_mmx; |
936 | 1871 |
1872 c->hadamard8_diff[0]= hadamard8_diff16_mmx; | |
1873 c->hadamard8_diff[1]= hadamard8_diff_mmx; | |
1874 | |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
984
diff
changeset
|
1875 c->pix_norm1 = pix_norm1_mmx; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
984
diff
changeset
|
1876 c->sse[0] = sse16_mmx; |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
1877 #endif //CONFIG_ENCODERS |
1647 | 1878 |
1879 c->h263_v_loop_filter= h263_v_loop_filter_mmx; | |
1648 | 1880 c->h263_h_loop_filter= h263_h_loop_filter_mmx; |
936 | 1881 |
0 | 1882 if (mm_flags & MM_MMXEXT) { |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1883 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1884 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; |
651 | 1885 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1886 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1887 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1888 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; |
415 | 1889 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1890 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1891 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2; |
651 | 1892 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1893 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1894 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1895 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; |
1092 | 1896 |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
1897 #ifdef CONFIG_ENCODERS |
1153 | 1898 c->hadamard8_diff[0]= hadamard8_diff16_mmx2; |
1899 c->hadamard8_diff[1]= hadamard8_diff_mmx2; | |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
1900 #endif //CONFIG_ENCODERS |
1153 | 1901 |
1092 | 1902 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
1903 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; | |
1904 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; | |
1905 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; | |
1906 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; | |
1907 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; | |
1908 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; | |
1909 } | |
959 | 1910 |
961 | 1911 #if 1 |
954 | 1912 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2) |
1913 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2) | |
1914 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2) | |
1915 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2) | |
1916 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2) | |
1917 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2) | |
1918 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2) | |
1919 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2) | |
1920 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2) | |
1921 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2) | |
1922 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2) | |
1923 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2) | |
1924 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2) | |
1925 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2) | |
1926 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2) | |
1927 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2) | |
1928 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2) | |
1929 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2) | |
1930 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2) | |
1931 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2) | |
1932 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2) | |
1933 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2) | |
1934 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2) | |
1935 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2) | |
1936 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2) | |
1937 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2) | |
1938 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2) | |
1939 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2) | |
1940 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2) | |
1941 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2) | |
1942 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2) | |
1943 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2) | |
961 | 1944 #endif |
1527 | 1945 |
1686
68abbec33289
Here are juste two added #ifdef CONFIG_ENCODERS to allow
michael
parents:
1648
diff
changeset
|
1946 #ifdef CONFIG_ENCODERS |
1527 | 1947 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; |
1686
68abbec33289
Here are juste two added #ifdef CONFIG_ENCODERS to allow
michael
parents:
1648
diff
changeset
|
1948 #endif //CONFIG_ENCODERS |
0 | 1949 } else if (mm_flags & MM_3DNOW) { |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1950 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1951 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; |
393 | 1952 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1953 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1954 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1955 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; |
651 | 1956 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1957 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1958 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1959 |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1960 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1961 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1962 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; |
1092 | 1963 |
1964 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
1965 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; | |
1966 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; | |
1967 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; | |
1968 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; | |
1969 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; | |
1970 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; | |
1971 } | |
984 | 1972 |
954 | 1973 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow) |
1974 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow) | |
1975 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow) | |
1976 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow) | |
1977 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow) | |
1978 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow) | |
1979 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow) | |
1980 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow) | |
1981 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow) | |
1982 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow) | |
1983 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow) | |
1984 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow) | |
1985 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow) | |
1986 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow) | |
1987 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow) | |
1988 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow) | |
1989 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow) | |
1990 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow) | |
1991 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow) | |
1992 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow) | |
1993 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow) | |
1994 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow) | |
1995 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow) | |
1996 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow) | |
1997 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow) | |
1998 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow) | |
1999 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow) | |
2000 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow) | |
2001 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow) | |
2002 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow) | |
2003 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow) | |
2004 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow) | |
0 | 2005 } |
2006 } | |
1092 | 2007 |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
2008 #ifdef CONFIG_ENCODERS |
1092 | 2009 dsputil_init_pix_mmx(c, avctx); |
1530
3b31998fe22f
disable encoders where appropriate (patch courtesy of BERO
melanson
parents:
1527
diff
changeset
|
2010 #endif //CONFIG_ENCODERS |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2011 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2012 // for speed testing |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2013 get_pixels = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2014 put_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2015 add_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2016 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2017 pix_abs16x16 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2018 pix_abs16x16_x2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2019 pix_abs16x16_y2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2020 pix_abs16x16_xy2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2021 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2022 put_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2023 put_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2024 put_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2025 put_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2026 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2027 put_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2028 put_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2029 put_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2030 put_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2031 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2032 avg_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2033 avg_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2034 avg_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2035 avg_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2036 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2037 avg_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2038 avg_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2039 avg_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2040 avg_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2041 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2042 //av_fdct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2043 //ff_idct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
2044 #endif |
0 | 2045 } |