Mercurial > libavcodec.hg
annotate i386/dsputil_mmx_avg.h @ 5839:b098f7cfc478 libavcodec
WMV3 mode with FASTTX=0 seems to use standard IDCT instead of own 8x8 transform
author | kostya |
---|---|
date | Thu, 18 Oct 2007 16:10:36 +0000 |
parents | dfd34e7f243f |
children | ec49b2361300 |
rev | line source |
---|---|
0 | 1 /* |
2 * DSP utils : average functions are compiled twice for 3dnow/mmx2 | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
1739
07a484280a82
copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents:
1064
diff
changeset
|
4 * Copyright (c) 2002-2004 Michael Niedermayer |
0 | 5 * |
5214 | 6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
7 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> | |
8 * and improved by Zdenek Kabelac <kabi@users.sf.net> | |
9 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
10 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
11 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
12 * FFmpeg is free software; you can redistribute it and/or |
429 | 13 * modify it under the terms of the GNU Lesser General Public |
14 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
15 * version 2.1 of the License, or (at your option) any later version. |
0 | 16 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
17 * FFmpeg is distributed in the hope that it will be useful, |
0 | 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
20 * Lesser General Public License for more details. | |
0 | 21 * |
429 | 22 * You should have received a copy of the GNU Lesser General Public |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
23 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
0 | 25 */ |
2967 | 26 |
5833
dfd34e7f243f
Explain why there are no multiple inclusion guards in these header files.
diego
parents:
5831
diff
changeset
|
27 /* This header intentionally has no multiple inclusion guards. It is meant to |
dfd34e7f243f
Explain why there are no multiple inclusion guards in these header files.
diego
parents:
5831
diff
changeset
|
28 * be included multiple times and generates different code depending on the |
dfd34e7f243f
Explain why there are no multiple inclusion guards in these header files.
diego
parents:
5831
diff
changeset
|
29 * value of certain #defines. */ |
dfd34e7f243f
Explain why there are no multiple inclusion guards in these header files.
diego
parents:
5831
diff
changeset
|
30 |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
31 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
32 clobber bug - now it will work with 2.95.2 and also with -fPIC |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
33 */ |
/*
 * put_pixels8_x2: horizontal half-pel "put" for 8-byte wide rows.
 * For every row, the 8 bytes at pixels are combined by PAVGB with the 8 bytes
 * at pixels + 1 and the result is stored to block. Both pointers advance by
 * line_size per row.
 * Four rows are handled per loop iteration and h is decremented by 4 until it
 * is exactly zero, so h must be a positive multiple of 4.
 * DEF, PAVGB and REG_a are macros supplied by the including file; PAVGB is
 * presumably the packed unsigned byte average of the selected instruction
 * set -- confirm against the includer.
 */
1064 | 34 static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 35 { |
386 | 36 __asm __volatile( |
2979 | 37 "lea (%3, %3), %%"REG_a" \n\t" |
38 "1: \n\t" | |
39 "movq (%1), %%mm0 \n\t" | |
40 "movq (%1, %3), %%mm1 \n\t" | |
41 PAVGB" 1(%1), %%mm0 \n\t" | |
42 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
43 "movq %%mm0, (%2) \n\t" | |
44 "movq %%mm1, (%2, %3) \n\t" | |
45 "add %%"REG_a", %1 \n\t" | |
46 "add %%"REG_a", %2 \n\t" | |
47 "movq (%1), %%mm0 \n\t" | |
48 "movq (%1, %3), %%mm1 \n\t" | |
49 PAVGB" 1(%1), %%mm0 \n\t" | |
50 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
51 "add %%"REG_a", %1 \n\t" | |
52 "movq %%mm0, (%2) \n\t" | |
53 "movq %%mm1, (%2, %3) \n\t" | |
54 "add %%"REG_a", %2 \n\t" | |
55 "subl $4, %0 \n\t" | |
56 "jnz 1b \n\t" | |
57 :"+g"(h), "+S"(pixels), "+D"(block) | |
58 :"r" ((long)line_size) | |
59 :"%"REG_a, "memory"); | |
441 | 60 } |
651 | 61 |
/*
 * put_pixels4_l2: dst row = PAVGB(src1 row, src2 row) for 4-byte wide rows.
 * src1 advances by src1Stride and dst by dstStride per row, while src2 is
 * read as consecutive 4-byte rows (it advances by 4 bytes per row).
 * If h is odd, a single row is handled first and h is decremented. The main
 * loop then handles 4 rows per iteration and only exits when the count is
 * exactly zero, so the remaining count must be a nonzero multiple of 4.
 * When PIC is defined, h uses the "+m" constraint instead of "+b".
 * DEF and PAVGB are macros supplied by the including file.
 */
2209 | 62 static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
63 { | |
64 __asm __volatile( | |
2979 | 65 "testl $1, %0 \n\t" |
66 " jz 1f \n\t" | |
67 "movd (%1), %%mm0 \n\t" | |
68 "movd (%2), %%mm1 \n\t" | |
69 "add %4, %1 \n\t" | |
70 "add $4, %2 \n\t" | |
71 PAVGB" %%mm1, %%mm0 \n\t" | |
72 "movd %%mm0, (%3) \n\t" | |
73 "add %5, %3 \n\t" | |
74 "decl %0 \n\t" | |
75 "1: \n\t" | |
76 "movd (%1), %%mm0 \n\t" | |
77 "add %4, %1 \n\t" | |
78 "movd (%1), %%mm1 \n\t" | |
79 "movd (%2), %%mm2 \n\t" | |
80 "movd 4(%2), %%mm3 \n\t" | |
81 "add %4, %1 \n\t" | |
82 PAVGB" %%mm2, %%mm0 \n\t" | |
83 PAVGB" %%mm3, %%mm1 \n\t" | |
84 "movd %%mm0, (%3) \n\t" | |
85 "add %5, %3 \n\t" | |
86 "movd %%mm1, (%3) \n\t" | |
87 "add %5, %3 \n\t" | |
88 "movd (%1), %%mm0 \n\t" | |
89 "add %4, %1 \n\t" | |
90 "movd (%1), %%mm1 \n\t" | |
91 "movd 8(%2), %%mm2 \n\t" | |
92 "movd 12(%2), %%mm3 \n\t" | |
93 "add %4, %1 \n\t" | |
94 PAVGB" %%mm2, %%mm0 \n\t" | |
95 PAVGB" %%mm3, %%mm1 \n\t" | |
96 "movd %%mm0, (%3) \n\t" | |
97 "add %5, %3 \n\t" | |
98 "movd %%mm1, (%3) \n\t" | |
99 "add %5, %3 \n\t" | |
100 "add $16, %2 \n\t" | |
101 "subl $4, %0 \n\t" | |
102 "jnz 1b \n\t" | |
5127 | 103 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
2979 | 104 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2209 | 105 #else |
2979 | 106 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2209 | 107 #endif |
2979 | 108 :"S"((long)src1Stride), "D"((long)dstStride) |
109 :"memory"); | |
2209 | 110 } |
111 | |
112 | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/*
 * put_pixels8_l2: dst row = PAVGB(src1 row, src2 row) for 8-byte wide rows.
 * src1 advances by src1Stride and dst by dstStride per row, while src2 is
 * read as consecutive 8-byte rows (it advances by 8 bytes per row).
 * If h is odd, a single row is handled first and h is decremented. The main
 * loop then handles 4 rows per iteration and only exits when the count is
 * exactly zero, so the remaining count must be a nonzero multiple of 4.
 * When PIC is defined, h uses the "+m" constraint instead of "+b".
 * DEF and PAVGB are macros supplied by the including file.
 */
113 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
954 | 114 { |
115 __asm __volatile( | |
2979 | 116 "testl $1, %0 \n\t" |
117 " jz 1f \n\t" | |
118 "movq (%1), %%mm0 \n\t" | |
119 "movq (%2), %%mm1 \n\t" | |
120 "add %4, %1 \n\t" | |
121 "add $8, %2 \n\t" | |
122 PAVGB" %%mm1, %%mm0 \n\t" | |
123 "movq %%mm0, (%3) \n\t" | |
124 "add %5, %3 \n\t" | |
125 "decl %0 \n\t" | |
126 "1: \n\t" | |
127 "movq (%1), %%mm0 \n\t" | |
128 "add %4, %1 \n\t" | |
129 "movq (%1), %%mm1 \n\t" | |
130 "add %4, %1 \n\t" | |
131 PAVGB" (%2), %%mm0 \n\t" | |
132 PAVGB" 8(%2), %%mm1 \n\t" | |
133 "movq %%mm0, (%3) \n\t" | |
134 "add %5, %3 \n\t" | |
135 "movq %%mm1, (%3) \n\t" | |
136 "add %5, %3 \n\t" | |
137 "movq (%1), %%mm0 \n\t" | |
138 "add %4, %1 \n\t" | |
139 "movq (%1), %%mm1 \n\t" | |
140 "add %4, %1 \n\t" | |
141 PAVGB" 16(%2), %%mm0 \n\t" | |
142 PAVGB" 24(%2), %%mm1 \n\t" | |
143 "movq %%mm0, (%3) \n\t" | |
144 "add %5, %3 \n\t" | |
145 "movq %%mm1, (%3) \n\t" | |
146 "add %5, %3 \n\t" | |
147 "add $32, %2 \n\t" | |
148 "subl $4, %0 \n\t" | |
149 "jnz 1b \n\t" | |
5127 | 150 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
2979 | 151 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
152 #else |
2979 | 153 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
154 #endif |
2979 | 155 :"S"((long)src1Stride), "D"((long)dstStride) |
156 :"memory"); | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
157 //the following should be used, though better not with gcc ... |
2979 | 158 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
159 :"r"(src1Stride), "r"(dstStride) | |
160 :"memory");*/ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
161 } |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
162 |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/*
 * put_no_rnd_pixels8_l2: like the 8-byte wide two-source "put", but every
 * input is XORed with mm6 before PAVGB and the result is XORed with mm6
 * again before it is stored. mm6 is set to all ones by pcmpeqb, so each row
 * is stored as ~PAVGB(~src1, ~src2).
 * NOTE(review): presumably this turns a round-up average into a round-down
 * one (the "no_rnd" variant) -- this assumes PAVGB rounds up; confirm.
 * src1 advances by src1Stride and dst by dstStride per row, while src2 is
 * read as consecutive 8-byte rows.
 * If h is odd, a single row is handled first and h is decremented. The main
 * loop then handles 4 rows per iteration and only exits when the count is
 * exactly zero, so the remaining count must be a nonzero multiple of 4.
 * When PIC is defined, h uses the "+m" constraint instead of "+b".
 */
163 static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
164 { |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
165 __asm __volatile( |
2979 | 166 "pcmpeqb %%mm6, %%mm6 \n\t" |
167 "testl $1, %0 \n\t" | |
168 " jz 1f \n\t" | |
169 "movq (%1), %%mm0 \n\t" | |
170 "movq (%2), %%mm1 \n\t" | |
171 "add %4, %1 \n\t" | |
172 "add $8, %2 \n\t" | |
173 "pxor %%mm6, %%mm0 \n\t" | |
174 "pxor %%mm6, %%mm1 \n\t" | |
175 PAVGB" %%mm1, %%mm0 \n\t" | |
176 "pxor %%mm6, %%mm0 \n\t" | |
177 "movq %%mm0, (%3) \n\t" | |
178 "add %5, %3 \n\t" | |
179 "decl %0 \n\t" | |
180 "1: \n\t" | |
181 "movq (%1), %%mm0 \n\t" | |
182 "add %4, %1 \n\t" | |
183 "movq (%1), %%mm1 \n\t" | |
184 "add %4, %1 \n\t" | |
185 "movq (%2), %%mm2 \n\t" | |
186 "movq 8(%2), %%mm3 \n\t" | |
187 "pxor %%mm6, %%mm0 \n\t" | |
188 "pxor %%mm6, %%mm1 \n\t" | |
189 "pxor %%mm6, %%mm2 \n\t" | |
190 "pxor %%mm6, %%mm3 \n\t" | |
191 PAVGB" %%mm2, %%mm0 \n\t" | |
192 PAVGB" %%mm3, %%mm1 \n\t" | |
193 "pxor %%mm6, %%mm0 \n\t" | |
194 "pxor %%mm6, %%mm1 \n\t" | |
195 "movq %%mm0, (%3) \n\t" | |
196 "add %5, %3 \n\t" | |
197 "movq %%mm1, (%3) \n\t" | |
198 "add %5, %3 \n\t" | |
199 "movq (%1), %%mm0 \n\t" | |
200 "add %4, %1 \n\t" | |
201 "movq (%1), %%mm1 \n\t" | |
202 "add %4, %1 \n\t" | |
203 "movq 16(%2), %%mm2 \n\t" | |
204 "movq 24(%2), %%mm3 \n\t" | |
205 "pxor %%mm6, %%mm0 \n\t" | |
206 "pxor %%mm6, %%mm1 \n\t" | |
207 "pxor %%mm6, %%mm2 \n\t" | |
208 "pxor %%mm6, %%mm3 \n\t" | |
209 PAVGB" %%mm2, %%mm0 \n\t" | |
210 PAVGB" %%mm3, %%mm1 \n\t" | |
211 "pxor %%mm6, %%mm0 \n\t" | |
212 "pxor %%mm6, %%mm1 \n\t" | |
213 "movq %%mm0, (%3) \n\t" | |
214 "add %5, %3 \n\t" | |
215 "movq %%mm1, (%3) \n\t" | |
216 "add %5, %3 \n\t" | |
217 "add $32, %2 \n\t" | |
218 "subl $4, %0 \n\t" | |
219 "jnz 1b \n\t" | |
5127 | 220 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
2979 | 221 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
222 #else |
2979 | 223 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
224 #endif |
2979 | 225 :"S"((long)src1Stride), "D"((long)dstStride) |
226 :"memory"); | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
227 //the following should be used, though better not with gcc ... |
2979 | 228 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
229 :"r"(src1Stride), "r"(dstStride) | |
230 :"memory");*/ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
231 } |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
232 |
/*
 * avg_pixels4_l2: for 4-byte wide rows, combines src1 and src2 with PAVGB,
 * combines that result with the current contents of dst with PAVGB again,
 * and stores the low 4 bytes back to dst.
 * src1 advances by src1Stride and dst by dstStride per row, while src2 is
 * read as consecutive 4-byte rows (it advances by 4 bytes per row).
 * If h is odd, a single row is handled first and h is decremented. The main
 * loop then handles 4 rows per iteration and only exits when the count is
 * exactly zero, so the remaining count must be a nonzero multiple of 4.
 * When PIC is defined, h uses the "+m" constraint instead of "+b".
 * NOTE(review): PAVGB is applied directly to the memory operands (%2) and
 * (%3); with an MMX destination register this presumably reads 8 bytes even
 * though only 4 are meaningful here -- confirm that callers' buffers allow it.
 */
2209 | 233 static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
234 { | |
235 __asm __volatile( | |
2979 | 236 "testl $1, %0 \n\t" |
237 " jz 1f \n\t" | |
238 "movd (%1), %%mm0 \n\t" | |
239 "movd (%2), %%mm1 \n\t" | |
240 "add %4, %1 \n\t" | |
241 "add $4, %2 \n\t" | |
242 PAVGB" %%mm1, %%mm0 \n\t" | |
243 PAVGB" (%3), %%mm0 \n\t" | |
244 "movd %%mm0, (%3) \n\t" | |
245 "add %5, %3 \n\t" | |
246 "decl %0 \n\t" | |
247 "1: \n\t" | |
248 "movd (%1), %%mm0 \n\t" | |
249 "add %4, %1 \n\t" | |
250 "movd (%1), %%mm1 \n\t" | |
251 "add %4, %1 \n\t" | |
252 PAVGB" (%2), %%mm0 \n\t" | |
253 PAVGB" 4(%2), %%mm1 \n\t" | |
254 PAVGB" (%3), %%mm0 \n\t" | |
255 "movd %%mm0, (%3) \n\t" | |
256 "add %5, %3 \n\t" | |
257 PAVGB" (%3), %%mm1 \n\t" | |
258 "movd %%mm1, (%3) \n\t" | |
259 "add %5, %3 \n\t" | |
260 "movd (%1), %%mm0 \n\t" | |
261 "add %4, %1 \n\t" | |
262 "movd (%1), %%mm1 \n\t" | |
263 "add %4, %1 \n\t" | |
264 PAVGB" 8(%2), %%mm0 \n\t" | |
265 PAVGB" 12(%2), %%mm1 \n\t" | |
266 PAVGB" (%3), %%mm0 \n\t" | |
267 "movd %%mm0, (%3) \n\t" | |
268 "add %5, %3 \n\t" | |
269 PAVGB" (%3), %%mm1 \n\t" | |
270 "movd %%mm1, (%3) \n\t" | |
271 "add %5, %3 \n\t" | |
272 "add $16, %2 \n\t" | |
273 "subl $4, %0 \n\t" | |
274 "jnz 1b \n\t" | |
5127 | 275 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
2979 | 276 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2209 | 277 #else |
2979 | 278 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2209 | 279 #endif |
2979 | 280 :"S"((long)src1Stride), "D"((long)dstStride) |
281 :"memory"); | |
2209 | 282 } |
283 | |
284 | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/*
 * avg_pixels8_l2: for 8-byte wide rows, combines src1 and src2 with PAVGB,
 * combines that result with the current contents of dst with PAVGB again,
 * and stores it back to dst.
 * src1 advances by src1Stride and dst by dstStride per row, while src2 is
 * read as consecutive 8-byte rows (it advances by 8 bytes per row).
 * If h is odd, a single row is handled first and h is decremented. The main
 * loop then handles 4 rows per iteration and only exits when the count is
 * exactly zero, so the remaining count must be a nonzero multiple of 4.
 * When PIC is defined, h uses the "+m" constraint instead of "+b".
 * DEF and PAVGB are macros supplied by the including file.
 */
285 static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
286 { |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
287 __asm __volatile( |
2979 | 288 "testl $1, %0 \n\t" |
289 " jz 1f \n\t" | |
290 "movq (%1), %%mm0 \n\t" | |
291 "movq (%2), %%mm1 \n\t" | |
292 "add %4, %1 \n\t" | |
293 "add $8, %2 \n\t" | |
294 PAVGB" %%mm1, %%mm0 \n\t" | |
295 PAVGB" (%3), %%mm0 \n\t" | |
296 "movq %%mm0, (%3) \n\t" | |
297 "add %5, %3 \n\t" | |
298 "decl %0 \n\t" | |
299 "1: \n\t" | |
300 "movq (%1), %%mm0 \n\t" | |
301 "add %4, %1 \n\t" | |
302 "movq (%1), %%mm1 \n\t" | |
303 "add %4, %1 \n\t" | |
304 PAVGB" (%2), %%mm0 \n\t" | |
305 PAVGB" 8(%2), %%mm1 \n\t" | |
306 PAVGB" (%3), %%mm0 \n\t" | |
307 "movq %%mm0, (%3) \n\t" | |
308 "add %5, %3 \n\t" | |
309 PAVGB" (%3), %%mm1 \n\t" | |
310 "movq %%mm1, (%3) \n\t" | |
311 "add %5, %3 \n\t" | |
312 "movq (%1), %%mm0 \n\t" | |
313 "add %4, %1 \n\t" | |
314 "movq (%1), %%mm1 \n\t" | |
315 "add %4, %1 \n\t" | |
316 PAVGB" 16(%2), %%mm0 \n\t" | |
317 PAVGB" 24(%2), %%mm1 \n\t" | |
318 PAVGB" (%3), %%mm0 \n\t" | |
319 "movq %%mm0, (%3) \n\t" | |
320 "add %5, %3 \n\t" | |
321 PAVGB" (%3), %%mm1 \n\t" | |
322 "movq %%mm1, (%3) \n\t" | |
323 "add %5, %3 \n\t" | |
324 "add $32, %2 \n\t" | |
325 "subl $4, %0 \n\t" | |
326 "jnz 1b \n\t" | |
5127 | 327 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
2979 | 328 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
329 #else |
2979 | 330 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
331 #endif |
2979 | 332 :"S"((long)src1Stride), "D"((long)dstStride) |
333 :"memory"); | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
334 //the following should be used, though better not with gcc ... |
2979 | 335 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
336 :"r"(src1Stride), "r"(dstStride) | |
337 :"memory");*/ | |
954 | 338 } |
339 | |
/*
 * put_pixels16_x2: horizontal half-pel "put" for 16-byte wide rows.
 * Each row is handled as two 8-byte halves: the bytes at pixels and
 * pixels + 8 are combined by PAVGB with the bytes at pixels + 1 and
 * pixels + 9, and the results are stored to block and block + 8.
 * Both pointers advance by line_size per row.
 * Four rows are handled per loop iteration and h is decremented by 4 until it
 * is exactly zero, so h must be a positive multiple of 4.
 * DEF, PAVGB and REG_a are macros supplied by the including file.
 */
1064 | 340 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
651 | 341 { |
342 __asm __volatile( | |
2979 | 343 "lea (%3, %3), %%"REG_a" \n\t" |
344 "1: \n\t" | |
345 "movq (%1), %%mm0 \n\t" | |
346 "movq (%1, %3), %%mm1 \n\t" | |
347 "movq 8(%1), %%mm2 \n\t" | |
348 "movq 8(%1, %3), %%mm3 \n\t" | |
349 PAVGB" 1(%1), %%mm0 \n\t" | |
350 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
351 PAVGB" 9(%1), %%mm2 \n\t" | |
352 PAVGB" 9(%1, %3), %%mm3 \n\t" | |
353 "movq %%mm0, (%2) \n\t" | |
354 "movq %%mm1, (%2, %3) \n\t" | |
355 "movq %%mm2, 8(%2) \n\t" | |
356 "movq %%mm3, 8(%2, %3) \n\t" | |
357 "add %%"REG_a", %1 \n\t" | |
358 "add %%"REG_a", %2 \n\t" | |
359 "movq (%1), %%mm0 \n\t" | |
360 "movq (%1, %3), %%mm1 \n\t" | |
361 "movq 8(%1), %%mm2 \n\t" | |
362 "movq 8(%1, %3), %%mm3 \n\t" | |
363 PAVGB" 1(%1), %%mm0 \n\t" | |
364 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
365 PAVGB" 9(%1), %%mm2 \n\t" | |
366 PAVGB" 9(%1, %3), %%mm3 \n\t" | |
367 "add %%"REG_a", %1 \n\t" | |
368 "movq %%mm0, (%2) \n\t" | |
369 "movq %%mm1, (%2, %3) \n\t" | |
370 "movq %%mm2, 8(%2) \n\t" | |
371 "movq %%mm3, 8(%2, %3) \n\t" | |
372 "add %%"REG_a", %2 \n\t" | |
373 "subl $4, %0 \n\t" | |
374 "jnz 1b \n\t" | |
375 :"+g"(h), "+S"(pixels), "+D"(block) | |
376 :"r" ((long)line_size) | |
377 :"%"REG_a, "memory"); | |
651 | 378 } |
954 | 379 |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/*
 * put_pixels16_l2: dst row = PAVGB(src1 row, src2 row) for 16-byte wide rows,
 * handled as two 8-byte halves per row.
 * src1 advances by src1Stride and dst by dstStride per row, while src2 is
 * read as consecutive 16-byte rows (it advances by 16 bytes per row).
 * If h is odd, a single row is handled first and h is decremented. The main
 * loop then handles 2 rows per iteration and only exits when the count is
 * exactly zero, so the remaining count must be a nonzero multiple of 2.
 * When PIC is defined, h uses the "+m" constraint instead of "+b".
 * DEF and PAVGB are macros supplied by the including file.
 */
380 static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
954 | 381 { |
382 __asm __volatile( | |
2979 | 383 "testl $1, %0 \n\t" |
384 " jz 1f \n\t" | |
385 "movq (%1), %%mm0 \n\t" | |
386 "movq 8(%1), %%mm1 \n\t" | |
387 PAVGB" (%2), %%mm0 \n\t" | |
388 PAVGB" 8(%2), %%mm1 \n\t" | |
389 "add %4, %1 \n\t" | |
390 "add $16, %2 \n\t" | |
391 "movq %%mm0, (%3) \n\t" | |
392 "movq %%mm1, 8(%3) \n\t" | |
393 "add %5, %3 \n\t" | |
394 "decl %0 \n\t" | |
395 "1: \n\t" | |
396 "movq (%1), %%mm0 \n\t" | |
397 "movq 8(%1), %%mm1 \n\t" | |
398 "add %4, %1 \n\t" | |
399 PAVGB" (%2), %%mm0 \n\t" | |
400 PAVGB" 8(%2), %%mm1 \n\t" | |
401 "movq %%mm0, (%3) \n\t" | |
402 "movq %%mm1, 8(%3) \n\t" | |
403 "add %5, %3 \n\t" | |
404 "movq (%1), %%mm0 \n\t" | |
405 "movq 8(%1), %%mm1 \n\t" | |
406 "add %4, %1 \n\t" | |
407 PAVGB" 16(%2), %%mm0 \n\t" | |
408 PAVGB" 24(%2), %%mm1 \n\t" | |
409 "movq %%mm0, (%3) \n\t" | |
410 "movq %%mm1, 8(%3) \n\t" | |
411 "add %5, %3 \n\t" | |
412 "add $32, %2 \n\t" | |
413 "subl $2, %0 \n\t" | |
414 "jnz 1b \n\t" | |
5127 | 415 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
2979 | 416 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
417 #else |
2979 | 418 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
419 #endif |
2979 | 420 :"S"((long)src1Stride), "D"((long)dstStride) |
421 :"memory"); | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
422 //the following should be used, though better not with gcc ... |
2979 | 423 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
424 :"r"(src1Stride), "r"(dstStride) | |
425 :"memory");*/ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
426 } |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
427 |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/*
 * avg_pixels16_l2: for 16-byte wide rows (two 8-byte halves per row),
 * combines src1 and src2 with PAVGB, combines that result with the current
 * contents of dst with PAVGB again, and stores it back to dst.
 * src1 advances by src1Stride and dst by dstStride per row, while src2 is
 * read as consecutive 16-byte rows (it advances by 16 bytes per row).
 * If h is odd, a single row is handled first and h is decremented. The main
 * loop then handles 2 rows per iteration and only exits when the count is
 * exactly zero, so the remaining count must be a nonzero multiple of 2.
 * When PIC is defined, h uses the "+m" constraint instead of "+b".
 * DEF and PAVGB are macros supplied by the including file.
 */
428 static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
429 { |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
430 __asm __volatile( |
2979 | 431 "testl $1, %0 \n\t" |
432 " jz 1f \n\t" | |
433 "movq (%1), %%mm0 \n\t" | |
434 "movq 8(%1), %%mm1 \n\t" | |
435 PAVGB" (%2), %%mm0 \n\t" | |
436 PAVGB" 8(%2), %%mm1 \n\t" | |
437 "add %4, %1 \n\t" | |
438 "add $16, %2 \n\t" | |
439 PAVGB" (%3), %%mm0 \n\t" | |
440 PAVGB" 8(%3), %%mm1 \n\t" | |
441 "movq %%mm0, (%3) \n\t" | |
442 "movq %%mm1, 8(%3) \n\t" | |
443 "add %5, %3 \n\t" | |
444 "decl %0 \n\t" | |
445 "1: \n\t" | |
446 "movq (%1), %%mm0 \n\t" | |
447 "movq 8(%1), %%mm1 \n\t" | |
448 "add %4, %1 \n\t" | |
449 PAVGB" (%2), %%mm0 \n\t" | |
450 PAVGB" 8(%2), %%mm1 \n\t" | |
451 PAVGB" (%3), %%mm0 \n\t" | |
452 PAVGB" 8(%3), %%mm1 \n\t" | |
453 "movq %%mm0, (%3) \n\t" | |
454 "movq %%mm1, 8(%3) \n\t" | |
455 "add %5, %3 \n\t" | |
456 "movq (%1), %%mm0 \n\t" | |
457 "movq 8(%1), %%mm1 \n\t" | |
458 "add %4, %1 \n\t" | |
459 PAVGB" 16(%2), %%mm0 \n\t" | |
460 PAVGB" 24(%2), %%mm1 \n\t" | |
461 PAVGB" (%3), %%mm0 \n\t" | |
462 PAVGB" 8(%3), %%mm1 \n\t" | |
463 "movq %%mm0, (%3) \n\t" | |
464 "movq %%mm1, 8(%3) \n\t" | |
465 "add %5, %3 \n\t" | |
466 "add $32, %2 \n\t" | |
467 "subl $2, %0 \n\t" | |
468 "jnz 1b \n\t" | |
5127 | 469 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
2979 | 470 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
471 #else |
2979 | 472 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
473 #endif |
2979 | 474 :"S"((long)src1Stride), "D"((long)dstStride) |
475 :"memory"); | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
476 //the following should be used, though better not with gcc ... |
2979 | 477 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
478 :"r"(src1Stride), "r"(dstStride) | |
479 :"memory");*/ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
480 } |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
481 |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/*
 * put_no_rnd_pixels16_l2: like the 16-byte wide two-source "put", but every
 * input is XORed with mm6 before PAVGB and the result is XORed with mm6
 * again before it is stored. mm6 is set to all ones by pcmpeqb, so each
 * 8-byte half is stored as ~PAVGB(~src1, ~src2).
 * NOTE(review): presumably this turns a round-up average into a round-down
 * one (the "no_rnd" variant) -- this assumes PAVGB rounds up; confirm.
 * src1 advances by src1Stride and dst by dstStride per row, while src2 is
 * read as consecutive 16-byte rows.
 * If h is odd, a single row is handled first and h is decremented. The main
 * loop then handles 2 rows per iteration and only exits when the count is
 * exactly zero, so the remaining count must be a nonzero multiple of 2.
 * When PIC is defined, h uses the "+m" constraint instead of "+b".
 */
482 static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
483 { |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
484 __asm __volatile( |
2979 | 485 "pcmpeqb %%mm6, %%mm6 \n\t" |
486 "testl $1, %0 \n\t" | |
487 " jz 1f \n\t" | |
488 "movq (%1), %%mm0 \n\t" | |
489 "movq 8(%1), %%mm1 \n\t" | |
490 "movq (%2), %%mm2 \n\t" | |
491 "movq 8(%2), %%mm3 \n\t" | |
492 "pxor %%mm6, %%mm0 \n\t" | |
493 "pxor %%mm6, %%mm1 \n\t" | |
494 "pxor %%mm6, %%mm2 \n\t" | |
495 "pxor %%mm6, %%mm3 \n\t" | |
496 PAVGB" %%mm2, %%mm0 \n\t" | |
497 PAVGB" %%mm3, %%mm1 \n\t" | |
498 "pxor %%mm6, %%mm0 \n\t" | |
499 "pxor %%mm6, %%mm1 \n\t" | |
500 "add %4, %1 \n\t" | |
501 "add $16, %2 \n\t" | |
502 "movq %%mm0, (%3) \n\t" | |
503 "movq %%mm1, 8(%3) \n\t" | |
504 "add %5, %3 \n\t" | |
505 "decl %0 \n\t" | |
506 "1: \n\t" | |
507 "movq (%1), %%mm0 \n\t" | |
508 "movq 8(%1), %%mm1 \n\t" | |
509 "add %4, %1 \n\t" | |
510 "movq (%2), %%mm2 \n\t" | |
511 "movq 8(%2), %%mm3 \n\t" | |
512 "pxor %%mm6, %%mm0 \n\t" | |
513 "pxor %%mm6, %%mm1 \n\t" | |
514 "pxor %%mm6, %%mm2 \n\t" | |
515 "pxor %%mm6, %%mm3 \n\t" | |
516 PAVGB" %%mm2, %%mm0 \n\t" | |
517 PAVGB" %%mm3, %%mm1 \n\t" | |
518 "pxor %%mm6, %%mm0 \n\t" | |
519 "pxor %%mm6, %%mm1 \n\t" | |
520 "movq %%mm0, (%3) \n\t" | |
521 "movq %%mm1, 8(%3) \n\t" | |
522 "add %5, %3 \n\t" | |
523 "movq (%1), %%mm0 \n\t" | |
524 "movq 8(%1), %%mm1 \n\t" | |
525 "add %4, %1 \n\t" | |
526 "movq 16(%2), %%mm2 \n\t" | |
527 "movq 24(%2), %%mm3 \n\t" | |
528 "pxor %%mm6, %%mm0 \n\t" | |
529 "pxor %%mm6, %%mm1 \n\t" | |
530 "pxor %%mm6, %%mm2 \n\t" | |
531 "pxor %%mm6, %%mm3 \n\t" | |
532 PAVGB" %%mm2, %%mm0 \n\t" | |
533 PAVGB" %%mm3, %%mm1 \n\t" | |
534 "pxor %%mm6, %%mm0 \n\t" | |
535 "pxor %%mm6, %%mm1 \n\t" | |
536 "movq %%mm0, (%3) \n\t" | |
537 "movq %%mm1, 8(%3) \n\t" | |
538 "add %5, %3 \n\t" | |
539 "add $32, %2 \n\t" | |
540 "subl $2, %0 \n\t" | |
541 "jnz 1b \n\t" | |
5127 | 542 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
2979 | 543 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
544 #else |
2979 | 545 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
546 #endif |
2979 | 547 :"S"((long)src1Stride), "D"((long)dstStride) |
548 :"memory"); | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
549 //the following should be used, though better not with gcc ... |
2979 | 550 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
551 :"r"(src1Stride), "r"(dstStride) | |
552 :"memory");*/ | |
954 | 553 } |
2967 | 554 |
441 | 555 /* GL: this function does incorrect rounding if overflow */ |
/*
 * put_no_rnd_pixels8_x2: horizontal half-pel "put" for 8-byte wide rows in
 * the "no_rnd" flavour.
 * For every row, mm6 is subtracted with unsigned saturation (psubusb) from
 * the 8 bytes at pixels, the result is combined by PAVGB with the 8 bytes at
 * pixels + 1, and that is stored to block. Both pointers advance by
 * line_size per row.
 * mm6 is initialised by MOVQ_BONE, a macro defined outside this file;
 * presumably it loads the value 1 into every byte -- confirm.
 * Four rows are handled per loop iteration and h is decremented by 4 until it
 * is exactly zero, so h must be a positive multiple of 4.
 */
1064 | 556 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
441 | 557 { |
448 | 558 MOVQ_BONE(mm6); |
441 | 559 __asm __volatile( |
2979 | 560 "lea (%3, %3), %%"REG_a" \n\t" |
561 "1: \n\t" | |
562 "movq (%1), %%mm0 \n\t" | |
563 "movq (%1, %3), %%mm2 \n\t" | |
564 "movq 1(%1), %%mm1 \n\t" | |
565 "movq 1(%1, %3), %%mm3 \n\t" | |
566 "add %%"REG_a", %1 \n\t" | |
567 "psubusb %%mm6, %%mm0 \n\t" | |
568 "psubusb %%mm6, %%mm2 \n\t" | |
569 PAVGB" %%mm1, %%mm0 \n\t" | |
570 PAVGB" %%mm3, %%mm2 \n\t" | |
571 "movq %%mm0, (%2) \n\t" | |
572 "movq %%mm2, (%2, %3) \n\t" | |
573 "movq (%1), %%mm0 \n\t" | |
574 "movq 1(%1), %%mm1 \n\t" | |
575 "movq (%1, %3), %%mm2 \n\t" | |
576 "movq 1(%1, %3), %%mm3 \n\t" | |
577 "add %%"REG_a", %2 \n\t" | |
578 "add %%"REG_a", %1 \n\t" | |
579 "psubusb %%mm6, %%mm0 \n\t" | |
580 "psubusb %%mm6, %%mm2 \n\t" | |
581 PAVGB" %%mm1, %%mm0 \n\t" | |
582 PAVGB" %%mm3, %%mm2 \n\t" | |
583 "movq %%mm0, (%2) \n\t" | |
584 "movq %%mm2, (%2, %3) \n\t" | |
585 "add %%"REG_a", %2 \n\t" | |
586 "subl $4, %0 \n\t" | |
587 "jnz 1b \n\t" | |
588 :"+g"(h), "+S"(pixels), "+D"(block) | |
589 :"r" ((long)line_size) | |
590 :"%"REG_a, "memory"); | |
0 | 591 } |
592 | |
/*
 * put_pixels8_y2: vertical half-pel "put" for 8-byte wide rows.
 * Output row i is PAVGB(source row i, source row i + 1), where rows are
 * line_size bytes apart; the last source row of one step is kept in a
 * register and reused as the first row of the next, so h + 1 source rows are
 * read in total.
 * block is moved back by line_size once before the loop so that the stores
 * can use the (block + line_size) and (block + 2 * line_size) addressing
 * forms.
 * Four rows are handled per loop iteration and h is decremented by 4 until it
 * is exactly zero, so h must be a positive multiple of 4.
 * DEF, PAVGB and REG_a are macros supplied by the including file.
 */
1064 | 593 static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 594 { |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
595 __asm __volatile( |
2979 | 596 "lea (%3, %3), %%"REG_a" \n\t" |
597 "movq (%1), %%mm0 \n\t" | |
598 "sub %3, %2 \n\t" | |
599 "1: \n\t" | |
600 "movq (%1, %3), %%mm1 \n\t" | |
601 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
602 "add %%"REG_a", %1 \n\t" | |
603 PAVGB" %%mm1, %%mm0 \n\t" | |
604 PAVGB" %%mm2, %%mm1 \n\t" | |
605 "movq %%mm0, (%2, %3) \n\t" | |
606 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
607 "movq (%1, %3), %%mm1 \n\t" | |
608 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
609 "add %%"REG_a", %2 \n\t" | |
610 "add %%"REG_a", %1 \n\t" | |
611 PAVGB" %%mm1, %%mm2 \n\t" | |
612 PAVGB" %%mm0, %%mm1 \n\t" | |
613 "movq %%mm2, (%2, %3) \n\t" | |
614 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
615 "add %%"REG_a", %2 \n\t" | |
616 "subl $4, %0 \n\t" | |
617 "jnz 1b \n\t" | |
618 :"+g"(h), "+S"(pixels), "+D" (block) | |
619 :"r" ((long)line_size) | |
620 :"%"REG_a, "memory"); | |
386 | 621 } |
622 | |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by putting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
623 /* GL: this function rounds incorrectly on overflow */ |
/*
 * put_no_rnd_pixels8_y2: vertical half-pel copy of an 8-byte-wide block
 * with mm6 subtracted from one operand of every average.
 *
 * Output row r is PAVGB of source rows r and r + 1. In each pair the
 * odd-numbered source row (1, 3, 5, ...) has mm6 subtracted with psubusb
 * (saturating) before it is averaged with its neighbours. Source rows 0..h
 * are read and h rows are written.
 *
 *   block     - destination, 8 bytes written per row
 *   pixels    - source
 *   line_size - byte stride applied to both block and pixels
 *   h         - row count; four rows are produced per loop pass and the
 *               loop only exits when h reaches exactly 0, so h has to be
 *               a positive multiple of 4
 *
 * NOTE(review): MOVQ_BONE and PAVGB are defined outside this chunk.
 * MOVQ_BONE(mm6) presumably sets every byte of mm6 to 1, which would turn
 * the round-up byte average into a round-down one. Because psubusb
 * saturates, a source byte of 0 stays 0 and that lane still rounds up;
 * this is most likely the rounding caveat in the GL comment preceding this
 * function. Confirm against the macro definitions.
 *
 * mm6 is loaded in a separate statement and the MMX registers are not
 * listed as operands or clobbers; only REG_a and memory are declared.
 */
1064 | 624 static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
386 | 625 { |
448 | 626 MOVQ_BONE(mm6); |
0 | 627 __asm __volatile( |
/* REG_a = 2 * line_size */
2979 | 628 "lea (%3, %3), %%"REG_a" \n\t" |
/* mm0 = source row 0; it is the carried-over previous row inside the loop */
629 "movq (%1), %%mm0 \n\t" | |
/* bias block back by one row so the stores can use the (%2, %3) and (%2, REG_a) forms */
630 "sub %3, %2 \n\t" | |
631 "1: \n\t" | |
632 "movq (%1, %3), %%mm1 \n\t" | |
633 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
634 "add %%"REG_a", %1 \n\t" | |
/* the middle row is decremented once and then used in both averages */
635 "psubusb %%mm6, %%mm1 \n\t" | |
636 PAVGB" %%mm1, %%mm0 \n\t" | |
637 PAVGB" %%mm2, %%mm1 \n\t" | |
638 "movq %%mm0, (%2, %3) \n\t" | |
639 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
/* second pair of rows: mm2 still holds the undecremented row loaded above */
640 "movq (%1, %3), %%mm1 \n\t" | |
641 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
642 "add %%"REG_a", %2 \n\t" | |
643 "add %%"REG_a", %1 \n\t" | |
644 "psubusb %%mm6, %%mm1 \n\t" | |
645 PAVGB" %%mm1, %%mm2 \n\t" | |
646 PAVGB" %%mm0, %%mm1 \n\t" | |
647 "movq %%mm2, (%2, %3) \n\t" | |
648 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
649 "add %%"REG_a", %2 \n\t" | |
/* four rows were written in this pass */
650 "subl $4, %0 \n\t" | |
651 "jnz 1b \n\t" | |
/* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size widened to long */
652 :"+g"(h), "+S"(pixels), "+D" (block) | |
653 :"r" ((long)line_size) | |
654 :"%"REG_a, "memory"); | |
0 | 655 } |
656 | |
/*
 * avg_pixels8: blend an 8-byte-wide source block into the destination.
 *
 * Every destination row is replaced by PAVGB(block row, pixels row); block
 * is therefore both read and written.
 *
 *   block     - destination, read and rewritten, 8 bytes per row
 *   pixels    - source
 *   line_size - byte stride applied to both block and pixels
 *   h         - row count; four rows are processed per loop pass and the
 *               loop only exits when h reaches exactly 0, so h has to be
 *               a positive multiple of 4
 *
 * PAVGB is a macro defined outside this chunk. mm0/mm1 are used without
 * being listed as clobbers; only REG_a and memory are declared.
 */
1064 | 657 static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
0 | 658 { |
659 __asm __volatile( | |
/* REG_a = 2 * line_size: both pointers advance two rows at a time */
2979 | 660 "lea (%3, %3), %%"REG_a" \n\t" |
661 "1: \n\t" | |
/* rows 0-1: load the current destination rows and average the source rows into them */
662 "movq (%2), %%mm0 \n\t" | |
663 "movq (%2, %3), %%mm1 \n\t" | |
664 PAVGB" (%1), %%mm0 \n\t" | |
665 PAVGB" (%1, %3), %%mm1 \n\t" | |
666 "movq %%mm0, (%2) \n\t" | |
667 "movq %%mm1, (%2, %3) \n\t" | |
668 "add %%"REG_a", %1 \n\t" | |
669 "add %%"REG_a", %2 \n\t" | |
/* rows 2-3: same sequence */
670 "movq (%2), %%mm0 \n\t" | |
671 "movq (%2, %3), %%mm1 \n\t" | |
672 PAVGB" (%1), %%mm0 \n\t" | |
673 PAVGB" (%1, %3), %%mm1 \n\t" | |
674 "add %%"REG_a", %1 \n\t" | |
675 "movq %%mm0, (%2) \n\t" | |
676 "movq %%mm1, (%2, %3) \n\t" | |
677 "add %%"REG_a", %2 \n\t" | |
/* four rows were processed in this pass */
678 "subl $4, %0 \n\t" | |
679 "jnz 1b \n\t" | |
/* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size widened to long */
680 :"+g"(h), "+S"(pixels), "+D"(block) | |
681 :"r" ((long)line_size) | |
682 :"%"REG_a, "memory"); | |
0 | 683 } |
684 | |
/*
 * avg_pixels8_x2: horizontal half-pel interpolation of an 8-byte-wide
 * source block, blended into the destination.
 *
 * Per row: t = PAVGB(pixels[x], pixels[x + 1]), then
 * block[x] = PAVGB(t, block[x]). Nine source bytes are read per row and
 * block is both read and written.
 *
 *   block     - destination, read and rewritten, 8 bytes per row
 *   pixels    - source
 *   line_size - byte stride applied to both block and pixels
 *   h         - row count; four rows are processed per loop pass and the
 *               loop only exits when h reaches exactly 0, so h has to be
 *               a positive multiple of 4
 *
 * PAVGB is a macro defined outside this chunk. mm0/mm2 are used without
 * being listed as clobbers; only REG_a and memory are declared.
 */
1064 | 685 static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
0 | 686 { |
687 __asm __volatile( | |
/* REG_a = 2 * line_size: both pointers advance two rows at a time */
2979 | 688 "lea (%3, %3), %%"REG_a" \n\t" |
689 "1: \n\t" | |
/* rows 0-1: source, then source shifted by one byte, then destination */
690 "movq (%1), %%mm0 \n\t" | |
691 "movq (%1, %3), %%mm2 \n\t" | |
692 PAVGB" 1(%1), %%mm0 \n\t" | |
693 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
694 PAVGB" (%2), %%mm0 \n\t" | |
695 PAVGB" (%2, %3), %%mm2 \n\t" | |
696 "add %%"REG_a", %1 \n\t" | |
697 "movq %%mm0, (%2) \n\t" | |
698 "movq %%mm2, (%2, %3) \n\t" | |
/* rows 2-3: same sequence; the destination pointer is advanced before it is read */
699 "movq (%1), %%mm0 \n\t" | |
700 "movq (%1, %3), %%mm2 \n\t" | |
701 PAVGB" 1(%1), %%mm0 \n\t" | |
702 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
703 "add %%"REG_a", %2 \n\t" | |
704 "add %%"REG_a", %1 \n\t" | |
705 PAVGB" (%2), %%mm0 \n\t" | |
706 PAVGB" (%2, %3), %%mm2 \n\t" | |
707 "movq %%mm0, (%2) \n\t" | |
708 "movq %%mm2, (%2, %3) \n\t" | |
709 "add %%"REG_a", %2 \n\t" | |
/* four rows were processed in this pass */
710 "subl $4, %0 \n\t" | |
711 "jnz 1b \n\t" | |
/* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size widened to long */
712 :"+g"(h), "+S"(pixels), "+D"(block) | |
713 :"r" ((long)line_size) | |
714 :"%"REG_a, "memory"); | |
0 | 715 } |
716 | |
/*
 * avg_pixels8_y2: vertical half-pel interpolation of an 8-byte-wide source
 * block, blended into the destination.
 *
 * Per output row r: t = PAVGB(pixels row r, pixels row r + 1), then
 * block row r = PAVGB(t, block row r). Source rows 0..h are read (h + 1
 * rows); block is both read and written.
 *
 *   block     - destination, read and rewritten, 8 bytes per row
 *   pixels    - source
 *   line_size - byte stride applied to both block and pixels
 *   h         - row count; four rows are processed per loop pass and the
 *               loop only exits when h reaches exactly 0, so h has to be
 *               a positive multiple of 4
 *
 * PAVGB is a macro defined outside this chunk. mm0-mm4 are used without
 * being listed as clobbers; only REG_a and memory are declared.
 */
1064 | 717 static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
0 | 718 { |
719 __asm __volatile( | |
/* REG_a = 2 * line_size */
2979 | 720 "lea (%3, %3), %%"REG_a" \n\t" |
/* mm0 = source row 0; it is the carried-over previous row inside the loop */
721 "movq (%1), %%mm0 \n\t" | |
/* bias block back by one row so destination rows are addressed as (%2, %3) and (%2, REG_a) */
722 "sub %3, %2 \n\t" | |
723 "1: \n\t" | |
724 "movq (%1, %3), %%mm1 \n\t" | |
725 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
726 "add %%"REG_a", %1 \n\t" | |
/* vertical averages of neighbouring source rows */
727 PAVGB" %%mm1, %%mm0 \n\t" | |
728 PAVGB" %%mm2, %%mm1 \n\t" | |
/* fetch the existing destination rows and blend them in */
729 "movq (%2, %3), %%mm3 \n\t" | |
730 "movq (%2, %%"REG_a"), %%mm4 \n\t" | |
731 PAVGB" %%mm3, %%mm0 \n\t" | |
732 PAVGB" %%mm4, %%mm1 \n\t" | |
733 "movq %%mm0, (%2, %3) \n\t" | |
734 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
/* second pair of rows: mm2 still holds the last source row loaded above, mm0 receives the row carried into the next pass */
735 "movq (%1, %3), %%mm1 \n\t" | |
736 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
737 PAVGB" %%mm1, %%mm2 \n\t" | |
738 PAVGB" %%mm0, %%mm1 \n\t" | |
739 "add %%"REG_a", %2 \n\t" | |
740 "add %%"REG_a", %1 \n\t" | |
741 "movq (%2, %3), %%mm3 \n\t" | |
742 "movq (%2, %%"REG_a"), %%mm4 \n\t" | |
743 PAVGB" %%mm3, %%mm2 \n\t" | |
744 PAVGB" %%mm4, %%mm1 \n\t" | |
745 "movq %%mm2, (%2, %3) \n\t" | |
746 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
747 "add %%"REG_a", %2 \n\t" | |
/* four rows were processed in this pass */
748 "subl $4, %0 \n\t" | |
749 "jnz 1b \n\t" | |
/* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size widened to long */
750 :"+g"(h), "+S"(pixels), "+D"(block) | |
751 :"r" ((long)line_size) | |
752 :"%"REG_a, "memory"); | |
0 | 753 } |
754 | |
5127 | 755 /* Note this is not correctly rounded, but this function is only |
756 * used for B-frames so it does not matter. */ | |
/*
 * avg_pixels8_xy2: approximate diagonal (x and y) half-pel interpolation
 * of an 8-byte-wide source block, blended into the destination.
 *
 * The 2x2 neighbourhood is built from nested PAVGB steps instead of one
 * four-tap sum: H(r) = PAVGB(pixels row r at x, pixels row r at x + 1),
 * t = PAVGB(H(r), H(r + 1)), block row r = PAVGB(t, block row r).
 * Within each group of four rows, the row at offset 2 has mm6 subtracted
 * (psubusb, saturating) before its horizontal average; the other rows are
 * used unmodified. Source rows 0..h are read, 9 bytes each; block is both
 * read and written.
 *
 *   block     - destination, read and rewritten, 8 bytes per row
 *   pixels    - source
 *   line_size - byte stride applied to both block and pixels
 *   h         - row count; four rows are processed per loop pass and the
 *               loop only exits when h reaches exactly 0, so h has to be
 *               a positive multiple of 4
 *
 * NOTE(review): MOVQ_BONE, PAVGB and ASMALIGN are defined outside this
 * chunk. MOVQ_BONE(mm6) presumably sets every byte of mm6 to 1, so the
 * single subtraction looks like a partial compensation for the stacked
 * round-up averages; confirm against the macro definitions.
 *
 * mm6 is loaded in a separate statement and the MMX registers are not
 * listed as operands or clobbers; only REG_a and memory are declared.
 */
1064 | 757 static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
0 | 758 { |
448 | 759 MOVQ_BONE(mm6); |
0 | 760 __asm __volatile( |
/* REG_a = 2 * line_size */
2979 | 761 "lea (%3, %3), %%"REG_a" \n\t" |
/* mm0 = H(row 0); it is the carried-over previous row inside the loop */
762 "movq (%1), %%mm0 \n\t" | |
763 PAVGB" 1(%1), %%mm0 \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3036
diff
changeset
|
764 ASMALIGN(3) |
2979 | 765 "1: \n\t" |
/* mm1 = H(next row); mm2 = H(row after it) with mm6 subtracted from its unshifted bytes first */
766 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
767 "movq (%1, %3), %%mm1 \n\t" | |
768 "psubusb %%mm6, %%mm2 \n\t" | |
769 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
770 PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t" | |
771 "add %%"REG_a", %1 \n\t" | |
/* vertical step, then blend with the existing destination rows */
772 PAVGB" %%mm1, %%mm0 \n\t" | |
773 PAVGB" %%mm2, %%mm1 \n\t" | |
774 PAVGB" (%2), %%mm0 \n\t" | |
775 PAVGB" (%2, %3), %%mm1 \n\t" | |
776 "movq %%mm0, (%2) \n\t" | |
777 "movq %%mm1, (%2, %3) \n\t" | |
/* second pair of rows: no mm6 subtraction here; mm0 receives the H value carried into the next pass */
778 "movq (%1, %3), %%mm1 \n\t" | |
779 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
780 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
781 PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t" | |
782 "add %%"REG_a", %2 \n\t" | |
783 "add %%"REG_a", %1 \n\t" | |
784 PAVGB" %%mm1, %%mm2 \n\t" | |
785 PAVGB" %%mm0, %%mm1 \n\t" | |
786 PAVGB" (%2), %%mm2 \n\t" | |
787 PAVGB" (%2, %3), %%mm1 \n\t" | |
788 "movq %%mm2, (%2) \n\t" | |
789 "movq %%mm1, (%2, %3) \n\t" | |
790 "add %%"REG_a", %2 \n\t" | |
/* four rows were processed in this pass */
791 "subl $4, %0 \n\t" | |
792 "jnz 1b \n\t" | |
/* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size widened to long */
793 :"+g"(h), "+S"(pixels), "+D"(block) | |
794 :"r" ((long)line_size) | |
795 :"%"REG_a, "memory"); | |
0 | 796 } |
651 | 797 |
798 //FIXME the following could be optimized too ... | |
/* 16-byte-wide variant: runs put_no_rnd_pixels8_x2 on the left 8 columns and
 * again on the right 8 columns (block + 8, pixels + 8), with the same
 * line_size and h. */
1064 | 799 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
651 | 800 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); |
801 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); | |
802 } | |
/* 16-byte-wide variant: runs put_pixels8_y2 on the left 8 columns and again
 * on the right 8 columns (block + 8, pixels + 8), with the same line_size
 * and h. */
1064 | 803 static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
651 | 804 DEF(put_pixels8_y2)(block , pixels , line_size, h); |
805 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); | |
806 } | |
/* 16-byte-wide variant: runs put_no_rnd_pixels8_y2 on the left 8 columns and
 * again on the right 8 columns (block + 8, pixels + 8), with the same
 * line_size and h. */
1064 | 807 static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
651 | 808 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); |
809 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); | |
810 } | |
/* 16-byte-wide variant: runs avg_pixels8 on the left 8 columns and again on
 * the right 8 columns (block + 8, pixels + 8), with the same line_size
 * and h. */
1064 | 811 static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
651 | 812 DEF(avg_pixels8)(block , pixels , line_size, h); |
813 DEF(avg_pixels8)(block+8, pixels+8, line_size, h); | |
814 } | |
/* 16-byte-wide variant: runs avg_pixels8_x2 on the left 8 columns and again
 * on the right 8 columns (block + 8, pixels + 8), with the same line_size
 * and h. */
1064 | 815 static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
651 | 816 DEF(avg_pixels8_x2)(block , pixels , line_size, h); |
817 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); | |
818 } | |
/* 16-byte-wide variant: runs avg_pixels8_y2 on the left 8 columns and again
 * on the right 8 columns (block + 8, pixels + 8), with the same line_size
 * and h. */
1064 | 819 static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
651 | 820 DEF(avg_pixels8_y2)(block , pixels , line_size, h); |
821 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); | |
822 } | |
/* 16-byte-wide variant: runs avg_pixels8_xy2 on the left 8 columns and again
 * on the right 8 columns (block + 8, pixels + 8), with the same line_size
 * and h. */
1064 | 823 static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
651 | 824 DEF(avg_pixels8_xy2)(block , pixels , line_size, h); |
825 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); | |
826 } | |
827 | |
3807
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
/*
 * QPEL_2TAP_L3(OPNAME) expands to two functions,
 * DEF(OPNAME ## 2tap_qpel16_l3) and DEF(OPNAME ## 2tap_qpel8_l3), which
 * handle 16 and 8 bytes per row respectively, for h rows.
 *
 * Per row and byte x:
 *     v = PAVGB(PAVGB(src[x + off1], src[x + off2]), src[x])
 * so src[x] is averaged in last, after the two offset taps have been
 * averaged with each other. STORE_OP(dst operand, v) is then expanded and
 * v is stored to dst[x].
 *
 *   dst    - destination; only its distance from src is passed to the asm
 *   src    - source, advanced by stride after every row
 *   stride - byte step between rows, shared by src and dst
 *   h      - row count, decremented by 1 per row; the loop exits at 0
 *   off1, off2 - byte offsets, relative to src, of the two extra taps
 *
 * asm operands: %0 = h, %1 = src, %2 = off1, %3 = off2, %4 = dst - src
 * (so the address (%1,%4) is the current dst row), %5 = stride.
 * mm0/mm1 are used without being listed as clobbers; only memory is
 * declared.
 *
 * STORE_OP has to be defined at the point where the macro is expanded, and
 * PAVGB is a macro defined outside this chunk.
 */
828 #define QPEL_2TAP_L3(OPNAME) \ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
829 static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
830 asm volatile(\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
831 "1: \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
832 "movq (%1,%2), %%mm0 \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
833 "movq 8(%1,%2), %%mm1 \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
834 PAVGB" (%1,%3), %%mm0 \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
835 PAVGB" 8(%1,%3), %%mm1 \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
836 PAVGB" (%1), %%mm0 \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
837 PAVGB" 8(%1), %%mm1 \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
838 STORE_OP( (%1,%4),%%mm0)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
839 STORE_OP(8(%1,%4),%%mm1)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
840 "movq %%mm0, (%1,%4) \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
841 "movq %%mm1, 8(%1,%4) \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
842 "add %5, %1 \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
843 "decl %0 \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
844 "jnz 1b \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
845 :"+g"(h), "+r"(src)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
846 :"r"((long)off1), "r"((long)off2),\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
847 "r"((long)(dst-src)), "r"((long)stride)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
848 :"memory"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
849 );\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
850 }\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
851 static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
852 asm volatile(\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
853 "1: \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
854 "movq (%1,%2), %%mm0 \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
855 PAVGB" (%1,%3), %%mm0 \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
856 PAVGB" (%1), %%mm0 \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
857 STORE_OP((%1,%4),%%mm0)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
858 "movq %%mm0, (%1,%4) \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
859 "add %5, %1 \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
860 "decl %0 \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
861 "jnz 1b \n\t"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
862 :"+g"(h), "+r"(src)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
863 :"r"((long)off1), "r"((long)off2),\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
864 "r"((long)(dst-src)), "r"((long)stride)\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
865 :"memory"\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
866 );\ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
867 } |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
868 |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
/* avg_ variants: STORE_OP averages the result register with the bytes already in dst before the store */
869 #define STORE_OP(a,b) PAVGB" "#a","#b" \n\t" |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
870 QPEL_2TAP_L3(avg_) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
871 #undef STORE_OP |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
/* put_ variants: STORE_OP expands to nothing, so the result is stored unchanged */
872 #define STORE_OP(a,b) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
873 QPEL_2TAP_L3(put_) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
874 #undef STORE_OP |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3576
diff
changeset
|
875 #undef QPEL_2TAP_L3 |