Mercurial > libavcodec.hg
annotate i386/dsputil_mmx_avg.h @ 2367:c353719836af libavcodec
fix some type mismatches patch by (Jeff Muizelaar <muizelaar rogers com>)
author | michael |
---|---|
date | Thu, 25 Nov 2004 19:17:27 +0000 |
parents | 15cfba1b97b5 |
children | 00f608ae3791 |
rev | line source |
---|---|
0 | 1 /* |
2 * DSP utils : average functions are compiled twice for 3dnow/mmx2 | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
1739
07a484280a82
copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents:
1064
diff
changeset
|
4 * Copyright (c) 2002-2004 Michael Niedermayer |
0 | 5 * |
429 | 6 * This library is free software; you can redistribute it and/or |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2 of the License, or (at your option) any later version. | |
0 | 10 * |
429 | 11 * This library is distributed in the hope that it will be useful, |
0 | 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 * Lesser General Public License for more details. | |
0 | 15 * |
429 | 16 * You should have received a copy of the GNU Lesser General Public |
17 * License along with this library; if not, write to the Free Software | |
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
0 | 19 * |
20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
386 | 21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> |
441 | 22 * and improved by Zdenek Kabelac <kabi@users.sf.net> |
0 | 23 */ |
387 | 24 |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
25 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
26 clobber bug - now it will work with 2.95.2 and also with -fPIC |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
27 */ |
1064 | 28 static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 29 { |
386 | 30 __asm __volatile( |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
31 "lea (%3, %3), %%"REG_a" \n\t" |
441 | 32 "1: \n\t" |
33 "movq (%1), %%mm0 \n\t" | |
34 "movq (%1, %3), %%mm1 \n\t" | |
35 PAVGB" 1(%1), %%mm0 \n\t" | |
36 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
37 "movq %%mm0, (%2) \n\t" | |
38 "movq %%mm1, (%2, %3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
39 "add %%"REG_a", %1 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
40 "add %%"REG_a", %2 \n\t" |
441 | 41 "movq (%1), %%mm0 \n\t" |
42 "movq (%1, %3), %%mm1 \n\t" | |
43 PAVGB" 1(%1), %%mm0 \n\t" | |
44 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
45 "add %%"REG_a", %1 \n\t" |
441 | 46 "movq %%mm0, (%2) \n\t" |
47 "movq %%mm1, (%2, %3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
48 "add %%"REG_a", %2 \n\t" |
441 | 49 "subl $4, %0 \n\t" |
50 "jnz 1b \n\t" | |
51 :"+g"(h), "+S"(pixels), "+D"(block) | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
52 :"r" ((long)line_size) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
53 :"%"REG_a, "memory"); |
441 | 54 } |
651 | 55 |
2209 | 56 static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
57 { | |
58 __asm __volatile( | |
59 "testl $1, %0 \n\t" | |
60 " jz 1f \n\t" | |
61 "movd (%1), %%mm0 \n\t" | |
62 "movd (%2), %%mm1 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
63 "add %4, %1 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
64 "add $4, %2 \n\t" |
2209 | 65 PAVGB" %%mm1, %%mm0 \n\t" |
66 "movd %%mm0, (%3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
67 "add %5, %3 \n\t" |
2209 | 68 "decl %0 \n\t" |
69 "1: \n\t" | |
70 "movd (%1), %%mm0 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
71 "add %4, %1 \n\t" |
2209 | 72 "movd (%1), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
73 "add %4, %1 \n\t" |
2209 | 74 PAVGB" (%2), %%mm0 \n\t" |
75 PAVGB" 4(%2), %%mm1 \n\t" | |
76 "movd %%mm0, (%3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
77 "add %5, %3 \n\t" |
2209 | 78 "movd %%mm1, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
79 "add %5, %3 \n\t" |
2209 | 80 "movd (%1), %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
81 "add %4, %1 \n\t" |
2209 | 82 "movd (%1), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
83 "add %4, %1 \n\t" |
2209 | 84 PAVGB" 8(%2), %%mm0 \n\t" |
85 PAVGB" 12(%2), %%mm1 \n\t" | |
86 "movd %%mm0, (%3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
87 "add %5, %3 \n\t" |
2209 | 88 "movd %%mm1, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
89 "add %5, %3 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
90 "add $16, %2 \n\t" |
2209 | 91 "subl $4, %0 \n\t" |
92 "jnz 1b \n\t" | |
93 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used | |
94 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
95 #else | |
96 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
97 #endif | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
98 :"S"((long)src1Stride), "D"((long)dstStride) |
2209 | 99 :"memory"); |
100 } | |
101 | |
102 | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
103 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
954 | 104 { |
105 __asm __volatile( | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
106 "testl $1, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
107 " jz 1f \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
108 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
109 "movq (%2), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
110 "add %4, %1 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
111 "add $8, %2 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
112 PAVGB" %%mm1, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
113 "movq %%mm0, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
114 "add %5, %3 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
115 "decl %0 \n\t" |
954 | 116 "1: \n\t" |
117 "movq (%1), %%mm0 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
118 "add %4, %1 \n\t" |
954 | 119 "movq (%1), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
120 "add %4, %1 \n\t" |
954 | 121 PAVGB" (%2), %%mm0 \n\t" |
122 PAVGB" 8(%2), %%mm1 \n\t" | |
123 "movq %%mm0, (%3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
124 "add %5, %3 \n\t" |
954 | 125 "movq %%mm1, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
126 "add %5, %3 \n\t" |
954 | 127 "movq (%1), %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
128 "add %4, %1 \n\t" |
954 | 129 "movq (%1), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
130 "add %4, %1 \n\t" |
954 | 131 PAVGB" 16(%2), %%mm0 \n\t" |
132 PAVGB" 24(%2), %%mm1 \n\t" | |
133 "movq %%mm0, (%3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
134 "add %5, %3 \n\t" |
954 | 135 "movq %%mm1, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
136 "add %5, %3 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
137 "add $32, %2 \n\t" |
954 | 138 "subl $4, %0 \n\t" |
139 "jnz 1b \n\t" | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
140 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
141 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
142 #else |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
143 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
144 #endif |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
145 :"S"((long)src1Stride), "D"((long)dstStride) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
146 :"memory"); |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
147 //the following should be used, though better not with gcc ... |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
148 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
954 | 149 :"r"(src1Stride), "r"(dstStride) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
150 :"memory");*/ |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
151 } |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
152 |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
153 static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
154 { |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
155 __asm __volatile( |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
156 "pcmpeqb %%mm6, %%mm6 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
157 "testl $1, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
158 " jz 1f \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
159 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
160 "movq (%2), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
161 "add %4, %1 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
162 "add $8, %2 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
163 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
164 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
165 PAVGB" %%mm1, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
166 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
167 "movq %%mm0, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
168 "add %5, %3 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
169 "decl %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
170 "1: \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
171 "movq (%1), %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
172 "add %4, %1 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
173 "movq (%1), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
174 "add %4, %1 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
175 "movq (%2), %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
176 "movq 8(%2), %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
177 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
178 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
179 "pxor %%mm6, %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
180 "pxor %%mm6, %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
181 PAVGB" %%mm2, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
182 PAVGB" %%mm3, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
183 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
184 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
185 "movq %%mm0, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
186 "add %5, %3 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
187 "movq %%mm1, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
188 "add %5, %3 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
189 "movq (%1), %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
190 "add %4, %1 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
191 "movq (%1), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
192 "add %4, %1 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
193 "movq 16(%2), %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
194 "movq 24(%2), %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
195 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
196 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
197 "pxor %%mm6, %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
198 "pxor %%mm6, %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
199 PAVGB" %%mm2, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
200 PAVGB" %%mm3, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
201 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
202 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
203 "movq %%mm0, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
204 "add %5, %3 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
205 "movq %%mm1, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
206 "add %5, %3 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
207 "add $32, %2 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
208 "subl $4, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
209 "jnz 1b \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
210 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
211 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
212 #else |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
213 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
214 #endif |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
215 :"S"((long)src1Stride), "D"((long)dstStride) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
216 :"memory"); |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
217 //the following should be used, though better not with gcc ... |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
218 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
219 :"r"(src1Stride), "r"(dstStride) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
220 :"memory");*/ |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
221 } |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
222 |
/**
 * Average two 4-pixel-wide sources and blend the result into dst.
 *
 * For each of the h rows: dst = PAVGB(PAVGB(src1, src2), dst), where
 * PAVGB expands to pavgb (MMX2) or pavgusb (3DNow!) depending on which
 * variant this file is being compiled as (see file header).
 *
 * src1 advances by src1Stride per row; src2 is a packed buffer that
 * advances by 4 bytes per row; dst advances by dstStride.
 *
 * The loop is unrolled 4 rows per iteration; an odd h is handled by the
 * prologue before label 1. NOTE(review): h values where (h & ~1) is not
 * a multiple of 4 would not terminate cleanly — presumably callers only
 * pass compatible heights; verify against call sites.
 */
static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        /* if h is odd, process one row first so the loop count is even */
        "testl $1, %0               \n\t"
        " jz 1f                     \n\t"
        "movd (%1), %%mm0           \n\t"
        "movd (%2), %%mm1           \n\t"
        "add %4, %1                 \n\t"
        "add $4, %2                 \n\t"
        PAVGB" %%mm1, %%mm0         \n\t"
        PAVGB" (%3), %%mm0          \n\t" /* blend with existing dst (avg, not put) */
        "movd %%mm0, (%3)           \n\t"
        "add %5, %3                 \n\t"
        "decl %0                    \n\t"
        "1:                         \n\t" /* main loop: 4 rows per iteration */
        "movd (%1), %%mm0           \n\t"
        "add %4, %1                 \n\t"
        "movd (%1), %%mm1           \n\t"
        "add %4, %1                 \n\t"
        PAVGB" (%2), %%mm0          \n\t"
        PAVGB" 4(%2), %%mm1         \n\t"
        PAVGB" (%3), %%mm0          \n\t"
        "movd %%mm0, (%3)           \n\t"
        "add %5, %3                 \n\t"
        PAVGB" (%3), %%mm1          \n\t"
        "movd %%mm1, (%3)           \n\t"
        "add %5, %3                 \n\t"
        "movd (%1), %%mm0           \n\t"
        "add %4, %1                 \n\t"
        "movd (%1), %%mm1           \n\t"
        "add %4, %1                 \n\t"
        PAVGB" 8(%2), %%mm0         \n\t"
        PAVGB" 12(%2), %%mm1        \n\t"
        PAVGB" (%3), %%mm0          \n\t"
        "movd %%mm0, (%3)           \n\t"
        "add %5, %3                 \n\t"
        PAVGB" (%3), %%mm1          \n\t"
        "movd %%mm1, (%3)           \n\t"
        "add %5, %3                 \n\t"
        "add $16, %2                \n\t" /* src2 is packed: 4 rows x 4 bytes consumed */
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
}
273 | |
274 | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/**
 * Average two 8-pixel-wide sources and blend the result into dst.
 *
 * For each of the h rows: dst = PAVGB(PAVGB(src1, src2), dst), where
 * PAVGB expands to pavgb (MMX2) or pavgusb (3DNow!) depending on which
 * variant this file is being compiled as (see file header).
 *
 * src1 advances by src1Stride per row; src2 is a packed buffer that
 * advances by 8 bytes per row; dst advances by dstStride.
 *
 * The loop is unrolled 4 rows per iteration; an odd h is handled by the
 * prologue before label 1. NOTE(review): h values where (h & ~1) is not
 * a multiple of 4 would not terminate cleanly — presumably callers only
 * pass compatible heights; verify against call sites.
 */
static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        /* if h is odd, process one row first so the loop count is even */
        "testl $1, %0               \n\t"
        " jz 1f                     \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq (%2), %%mm1           \n\t"
        "add %4, %1                 \n\t"
        "add $8, %2                 \n\t"
        PAVGB" %%mm1, %%mm0         \n\t"
        PAVGB" (%3), %%mm0          \n\t" /* blend with existing dst (avg, not put) */
        "movq %%mm0, (%3)           \n\t"
        "add %5, %3                 \n\t"
        "decl %0                    \n\t"
        "1:                         \n\t" /* main loop: 4 rows per iteration */
        "movq (%1), %%mm0           \n\t"
        "add %4, %1                 \n\t"
        "movq (%1), %%mm1           \n\t"
        "add %4, %1                 \n\t"
        PAVGB" (%2), %%mm0          \n\t"
        PAVGB" 8(%2), %%mm1         \n\t"
        PAVGB" (%3), %%mm0          \n\t"
        "movq %%mm0, (%3)           \n\t"
        "add %5, %3                 \n\t"
        PAVGB" (%3), %%mm1          \n\t"
        "movq %%mm1, (%3)           \n\t"
        "add %5, %3                 \n\t"
        "movq (%1), %%mm0           \n\t"
        "add %4, %1                 \n\t"
        "movq (%1), %%mm1           \n\t"
        "add %4, %1                 \n\t"
        PAVGB" 16(%2), %%mm0        \n\t"
        PAVGB" 24(%2), %%mm1        \n\t"
        PAVGB" (%3), %%mm0          \n\t"
        "movq %%mm0, (%3)           \n\t"
        "add %5, %3                 \n\t"
        PAVGB" (%3), %%mm1          \n\t"
        "movq %%mm1, (%3)           \n\t"
        "add %5, %3                 \n\t"
        "add $32, %2                \n\t" /* src2 is packed: 4 rows x 8 bytes consumed */
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
329 | |
/**
 * Horizontal half-pel interpolation for a 16-pixel-wide block.
 *
 * Each destination byte is PAVGB(pixels[x], pixels[x+1]) — note the
 * "1(%1)"/"9(%1)" operands — written straight to block (put, no blend
 * with the previous block contents). PAVGB expands to pavgb (MMX2) or
 * pavgusb (3DNow!) depending on the compilation variant (see file header).
 *
 * Both pixels and block advance by line_size per row. The loop is
 * unrolled 4 rows per iteration; NOTE(review): h is presumably a
 * multiple of 4 — "subl $4"/"jnz" would not terminate otherwise.
 */
static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"    \n\t" /* REG_a = 2*line_size, to step two rows at once */
        "1:                         \n\t"
        /* rows 0 and 1: left 8 bytes in mm0/mm1, right 8 bytes in mm2/mm3 */
        "movq (%1), %%mm0           \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq 8(%1), %%mm2          \n\t"
        "movq 8(%1, %3), %%mm3      \n\t"
        PAVGB" 1(%1), %%mm0         \n\t" /* average with the pixel one to the right */
        PAVGB" 1(%1, %3), %%mm1     \n\t"
        PAVGB" 9(%1), %%mm2         \n\t"
        PAVGB" 9(%1, %3), %%mm3     \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "movq %%mm2, 8(%2)          \n\t"
        "movq %%mm3, 8(%2, %3)      \n\t"
        "add %%"REG_a", %1          \n\t"
        "add %%"REG_a", %2          \n\t"
        /* rows 2 and 3 */
        "movq (%1), %%mm0           \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq 8(%1), %%mm2          \n\t"
        "movq 8(%1, %3), %%mm3      \n\t"
        PAVGB" 1(%1), %%mm0         \n\t"
        PAVGB" 1(%1, %3), %%mm1     \n\t"
        PAVGB" 9(%1), %%mm2         \n\t"
        PAVGB" 9(%1, %3), %%mm3     \n\t"
        "add %%"REG_a", %1          \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "movq %%mm2, 8(%2)          \n\t"
        "movq %%mm3, 8(%2, %3)      \n\t"
        "add %%"REG_a", %2          \n\t"
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
954 | 369 |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/**
 * Average two 16-pixel-wide sources and store the result to dst.
 *
 * For each of the h rows: dst = PAVGB(src1, src2), handled as two 8-byte
 * halves in mm0/mm1 (put — dst's previous contents are overwritten, not
 * blended). PAVGB expands to pavgb (MMX2) or pavgusb (3DNow!) depending
 * on the compilation variant (see file header).
 *
 * src1 advances by src1Stride per row; src2 is a packed buffer that
 * advances by 16 bytes per row; dst advances by dstStride.
 *
 * The loop is unrolled 2 rows per iteration; an odd h is handled by the
 * prologue before label 1.
 */
static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        /* if h is odd, process one row first so the loop count is even */
        "testl $1, %0               \n\t"
        " jz 1f                     \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq 8(%1), %%mm1          \n\t"
        PAVGB" (%2), %%mm0          \n\t"
        PAVGB" 8(%2), %%mm1         \n\t"
        "add %4, %1                 \n\t"
        "add $16, %2                \n\t"
        "movq %%mm0, (%3)           \n\t"
        "movq %%mm1, 8(%3)          \n\t"
        "add %5, %3                 \n\t"
        "decl %0                    \n\t"
        "1:                         \n\t" /* main loop: 2 rows per iteration */
        "movq (%1), %%mm0           \n\t"
        "movq 8(%1), %%mm1          \n\t"
        "add %4, %1                 \n\t"
        PAVGB" (%2), %%mm0          \n\t"
        PAVGB" 8(%2), %%mm1         \n\t"
        "movq %%mm0, (%3)           \n\t"
        "movq %%mm1, 8(%3)          \n\t"
        "add %5, %3                 \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq 8(%1), %%mm1          \n\t"
        "add %4, %1                 \n\t"
        PAVGB" 16(%2), %%mm0        \n\t"
        PAVGB" 24(%2), %%mm1        \n\t"
        "movq %%mm0, (%3)           \n\t"
        "movq %%mm1, 8(%3)          \n\t"
        "add %5, %3                 \n\t"
        "add $32, %2                \n\t" /* src2 is packed: 2 rows x 16 bytes consumed */
        "subl $2, %0                \n\t"
        "jnz 1b                     \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
417 |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/*
 * Average a 16-pixel-wide block over h rows into dst:
 * dst = pavg(dst, pavg(src1, src2)) per byte, using PAVGB (mmx2) or
 * PAVGBR-equivalent (3dnow) depending on which DEF() variant is compiled.
 * src1 advances by src1Stride per row; src2 is packed with a fixed stride
 * of 16 bytes per row; dst advances by dstStride.
 * NOTE(review): h is presumably even or odd >= 1; the prologue handles one
 * row when h is odd, then the loop processes two rows per iteration.
 */
static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        /* if h is odd, process one row first so the main loop can do 2 rows */
        "testl $1, %0                   \n\t"
        " jz 1f                         \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq  8(%1), %%mm1             \n\t"
        PAVGB"  (%2), %%mm0             \n\t"
        PAVGB" 8(%2), %%mm1             \n\t"
        "add %4, %1                     \n\t"
        "add $16, %2                    \n\t"
        /* second average against the existing dst contents (the "avg" part) */
        PAVGB"  (%3), %%mm0             \n\t"
        PAVGB" 8(%3), %%mm1             \n\t"
        "movq %%mm0,  (%3)              \n\t"
        "movq %%mm1, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "decl %0                        \n\t"
        "1:                             \n\t"
        /* main loop: two rows per iteration */
        "movq   (%1), %%mm0             \n\t"
        "movq  8(%1), %%mm1             \n\t"
        "add %4, %1                     \n\t"
        PAVGB"  (%2), %%mm0             \n\t"
        PAVGB" 8(%2), %%mm1             \n\t"
        PAVGB"  (%3), %%mm0             \n\t"
        PAVGB" 8(%3), %%mm1             \n\t"
        "movq %%mm0,  (%3)              \n\t"
        "movq %%mm1, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq  8(%1), %%mm1             \n\t"
        "add %4, %1                     \n\t"
        /* second row of src2 lives at offset 16 (packed 16-byte stride) */
        PAVGB" 16(%2), %%mm0            \n\t"
        PAVGB" 24(%2), %%mm1            \n\t"
        PAVGB"  (%3), %%mm0             \n\t"
        PAVGB" 8(%3), %%mm1             \n\t"
        "movq %%mm0,  (%3)              \n\t"
        "movq %%mm1, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "add $32, %2                    \n\t"
        "subl $2, %0                    \n\t"
        "jnz 1b                         \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
/*
 * Store the no-rounding (truncating) byte-wise average of src1 and src2
 * into dst for a 16-pixel-wide block over h rows.
 * PAVGB rounds up; truncating average is obtained via the complement trick:
 *   ~pavg(~a, ~b) == (a + b) >> 1   (rounds down)
 * mm6 is set to all-ones (pcmpeqb x,x) and used for the pxor complements.
 * src1 advances by src1Stride per row; src2 is packed with a fixed 16-byte
 * per-row stride; dst advances by dstStride.
 */
static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        "pcmpeqb %%mm6, %%mm6           \n\t"  /* mm6 = 0xFF.. (complement mask) */
        /* if h is odd, process one row first so the main loop can do 2 rows */
        "testl $1, %0                   \n\t"
        " jz 1f                         \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq  8(%1), %%mm1             \n\t"
        "movq   (%2), %%mm2             \n\t"
        "movq  8(%2), %%mm3             \n\t"
        "pxor %%mm6, %%mm0              \n\t"  /* complement inputs ... */
        "pxor %%mm6, %%mm1              \n\t"
        "pxor %%mm6, %%mm2              \n\t"
        "pxor %%mm6, %%mm3              \n\t"
        PAVGB" %%mm2, %%mm0             \n\t"  /* ... rounded average ... */
        PAVGB" %%mm3, %%mm1             \n\t"
        "pxor %%mm6, %%mm0              \n\t"  /* ... complement back => truncating avg */
        "pxor %%mm6, %%mm1              \n\t"
        "add %4, %1                     \n\t"
        "add $16, %2                    \n\t"
        "movq %%mm0,  (%3)              \n\t"
        "movq %%mm1, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "decl %0                        \n\t"
        "1:                             \n\t"
        /* main loop: two rows per iteration */
        "movq   (%1), %%mm0             \n\t"
        "movq  8(%1), %%mm1             \n\t"
        "add %4, %1                     \n\t"
        "movq   (%2), %%mm2             \n\t"
        "movq  8(%2), %%mm3             \n\t"
        "pxor %%mm6, %%mm0              \n\t"
        "pxor %%mm6, %%mm1              \n\t"
        "pxor %%mm6, %%mm2              \n\t"
        "pxor %%mm6, %%mm3              \n\t"
        PAVGB" %%mm2, %%mm0             \n\t"
        PAVGB" %%mm3, %%mm1             \n\t"
        "pxor %%mm6, %%mm0              \n\t"
        "pxor %%mm6, %%mm1              \n\t"
        "movq %%mm0,  (%3)              \n\t"
        "movq %%mm1, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq  8(%1), %%mm1             \n\t"
        "add %4, %1                     \n\t"
        /* second row of src2 lives at offset 16 (packed 16-byte stride) */
        "movq 16(%2), %%mm2             \n\t"
        "movq 24(%2), %%mm3             \n\t"
        "pxor %%mm6, %%mm0              \n\t"
        "pxor %%mm6, %%mm1              \n\t"
        "pxor %%mm6, %%mm2              \n\t"
        "pxor %%mm6, %%mm3              \n\t"
        PAVGB" %%mm2, %%mm0             \n\t"
        PAVGB" %%mm3, %%mm1             \n\t"
        "pxor %%mm6, %%mm0              \n\t"
        "pxor %%mm6, %%mm1              \n\t"
        "movq %%mm0,  (%3)              \n\t"
        "movq %%mm1, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "add $32, %2                    \n\t"
        "subl $2, %0                    \n\t"
        "jnz 1b                         \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
441 | 544 |
/* GL: this function does incorrect rounding if overflow */
/*
 * Half-pel horizontal interpolation without rounding for an 8-pixel-wide
 * block over h rows: block[y][x] = truncating_avg(pixels[y][x], pixels[y][x+1]).
 * Truncation is approximated by subtracting 1 (saturating, mm6 = BONE = 0x01
 * in every byte) from one operand before the rounding PAVGB — hence the
 * "incorrect rounding if overflow" caveat above when the subtraction saturates
 * at 0.  Processes 4 rows per loop iteration; REG_a holds 2*line_size.
 * NOTE(review): h is presumably a multiple of 4 — the loop decrements by 4.
 */
static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BONE(mm6);                             /* mm6 = 0x0101010101010101 */
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"   /* REG_a = 2*line_size */
        "1:                             \n\t"
        /* rows 0 and 1: load pixel and pixel+1 pairs */
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "movq 1(%1), %%mm1              \n\t"
        "movq 1(%1, %3), %%mm3          \n\t"
        "add %%"REG_a", %1              \n\t"
        "psubusb %%mm6, %%mm0           \n\t"   /* bias down by 1 to cancel PAVGB round-up */
        "psubusb %%mm6, %%mm2           \n\t"
        PAVGB" %%mm1, %%mm0             \n\t"
        PAVGB" %%mm3, %%mm2             \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        /* rows 2 and 3 */
        "movq (%1), %%mm0               \n\t"
        "movq 1(%1), %%mm1              \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "movq 1(%1, %3), %%mm3          \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        "psubusb %%mm6, %%mm0           \n\t"
        "psubusb %%mm6, %%mm2           \n\t"
        PAVGB" %%mm1, %%mm0             \n\t"
        PAVGB" %%mm3, %%mm2             \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
582 | |
/* put_pixels8_y2: write an 8-byte-wide, h-line-high block where each output
 * byte is the PAVGB average of the source pixel and the pixel one line
 * below it (vertical half-pel interpolation).
 * Operand map: %0 = h, %1 = pixels (src), %2 = block (dst), %3 = line_size.
 * The loop body handles 4 lines per iteration ("subl $4, %0"), so h is
 * presumably a multiple of 4 -- TODO confirm against callers. */
static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t" /* REG_a = 2*line_size */
        "movq (%1), %%mm0               \n\t" /* prime mm0 with src line 0 */
        "sub %3, %2                     \n\t" /* bias dst: (%2,%3) is current line */
        "1:                             \n\t"
        "movq (%1, %3), %%mm1           \n\t" /* src line n+1 */
        "movq (%1, %%"REG_a"), %%mm2    \n\t" /* src line n+2 */
        "add %%"REG_a", %1              \n\t"
        PAVGB" %%mm1, %%mm0             \n\t" /* avg(line n,   line n+1) */
        PAVGB" %%mm2, %%mm1             \n\t" /* avg(line n+1, line n+2) */
        "movq %%mm0, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        PAVGB" %%mm1, %%mm2             \n\t"
        PAVGB" %%mm0, %%mm1             \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t" /* 4 output lines per iteration */
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D" (block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
612 | |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
/* GL: this function does incorrect rounding if overflow */
/* put_no_rnd_pixels8_y2: like put_pixels8_y2 but with "no rounding"
 * semantics: mm6 is loaded with a per-byte constant via MOVQ_BONE and
 * subtracted (saturating, psubusb) from one operand before PAVGB,
 * turning the round-up average into a truncating one. The saturation
 * on 0x00 bytes is what the comment above calls incorrect rounding.
 * Operand map: %0 = h, %1 = pixels (src), %2 = block (dst), %3 = line_size;
 * 4 lines per loop iteration, so h presumably a multiple of 4. */
static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BONE(mm6);
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t" /* REG_a = 2*line_size */
        "movq (%1), %%mm0               \n\t" /* prime with src line 0 */
        "sub %3, %2                     \n\t"
        "1:                             \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "add %%"REG_a", %1              \n\t"
        "psubusb %%mm6, %%mm1           \n\t" /* bias one operand -> truncating avg */
        PAVGB" %%mm1, %%mm0             \n\t"
        PAVGB" %%mm2, %%mm1             \n\t"
        "movq %%mm0, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        "psubusb %%mm6, %%mm1           \n\t"
        PAVGB" %%mm1, %%mm2             \n\t"
        PAVGB" %%mm0, %%mm1             \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D" (block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
646 | |
/* avg_pixels8: average an 8-byte-wide, h-line-high source block into the
 * destination in place: dst = PAVGB(dst, src) for every byte (the classic
 * motion-compensation "avg" primitive).
 * Operand map: %0 = h, %1 = pixels (src), %2 = block (dst), %3 = line_size;
 * 4 lines per loop iteration, so h presumably a multiple of 4. */
static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t" /* REG_a = 2*line_size */
        "1:                             \n\t"
        "movq (%2), %%mm0               \n\t" /* dst line 0 */
        "movq (%2, %3), %%mm1           \n\t" /* dst line 1 */
        PAVGB" (%1), %%mm0              \n\t" /* avg with src line 0 */
        PAVGB" (%1, %3), %%mm1          \n\t" /* avg with src line 1 */
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%2), %%mm0               \n\t" /* same again for lines 2/3 */
        "movq (%2, %3), %%mm1           \n\t"
        PAVGB" (%1), %%mm0              \n\t"
        PAVGB" (%1, %3), %%mm1          \n\t"
        "add %%"REG_a", %1              \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t" /* 4 lines per iteration */
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
674 | |
/* avg_pixels8_x2: horizontal half-pel interpolation averaged into dst.
 * For each byte: tmp = PAVGB(src[x], src[x+1]); dst = PAVGB(dst, tmp).
 * Operand map: %0 = h, %1 = pixels (src), %2 = block (dst), %3 = line_size;
 * 4 lines per loop iteration, so h presumably a multiple of 4.
 * Note the unaligned 1(%1) loads are implicit in the PAVGB memory operand. */
static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t" /* REG_a = 2*line_size */
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        PAVGB" 1(%1), %%mm0             \n\t" /* avg(src, src+1): half-pel x */
        PAVGB" 1(%1, %3), %%mm2         \n\t"
        PAVGB" (%2), %%mm0              \n\t" /* then average into dst */
        PAVGB" (%2, %3), %%mm2          \n\t"
        "add %%"REG_a", %1              \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "movq (%1), %%mm0               \n\t" /* lines 2/3 of the unrolled pair */
        "movq (%1, %3), %%mm2           \n\t"
        PAVGB" 1(%1), %%mm0             \n\t"
        PAVGB" 1(%1, %3), %%mm2         \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        PAVGB" (%2), %%mm0              \n\t"
        PAVGB" (%2, %3), %%mm2          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t" /* 4 lines per iteration */
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
706 | |
/* avg_pixels8_y2: vertical half-pel interpolation averaged into dst.
 * For each byte: tmp = PAVGB(src[y], src[y+1]); dst = PAVGB(dst, tmp).
 * Operand map: %0 = h, %1 = pixels (src), %2 = block (dst), %3 = line_size;
 * 4 lines per loop iteration, so h presumably a multiple of 4. */
static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t" /* REG_a = 2*line_size */
        "movq (%1), %%mm0               \n\t" /* prime with src line 0 */
        "sub %3, %2                     \n\t" /* bias dst: (%2,%3) is current line */
        "1:                             \n\t"
        "movq (%1, %3), %%mm1           \n\t" /* src line n+1 */
        "movq (%1, %%"REG_a"), %%mm2    \n\t" /* src line n+2 */
        "add %%"REG_a", %1              \n\t"
        PAVGB" %%mm1, %%mm0             \n\t" /* vertical half-pel averages */
        PAVGB" %%mm2, %%mm1             \n\t"
        "movq (%2, %3), %%mm3           \n\t" /* existing dst lines */
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        PAVGB" %%mm3, %%mm0             \n\t" /* average result into dst */
        PAVGB" %%mm4, %%mm1             \n\t"
        "movq %%mm0, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        PAVGB" %%mm1, %%mm2             \n\t"
        PAVGB" %%mm0, %%mm1             \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        "movq (%2, %3), %%mm3           \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        PAVGB" %%mm3, %%mm2             \n\t"
        PAVGB" %%mm4, %%mm1             \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t" /* 4 lines per iteration */
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
744 | |
// Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter
/* avg_pixels8_xy2: 2D (diagonal) half-pel interpolation averaged into dst,
 * approximated with chained PAVGBs instead of the exact (a+b+c+d+2)/4 --
 * hence the rounding caveat above. mm6 (MOVQ_BONE) is psubusb'ed from one
 * intermediate to partially compensate the accumulated round-up bias.
 * Operand map: %0 = h, %1 = pixels (src), %2 = block (dst), %3 = line_size;
 * 4 lines per loop iteration, so h presumably a multiple of 4. */
static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BONE(mm6);
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t" /* REG_a = 2*line_size */
        "movq (%1), %%mm0               \n\t"
        PAVGB" 1(%1), %%mm0             \n\t" /* mm0 = x-avg of line 0 */
        ".balign 8                      \n\t" /* align the loop head */
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "psubusb %%mm6, %%mm2           \n\t" /* rounding-bias compensation */
        PAVGB" 1(%1, %3), %%mm1         \n\t" /* x-avg of line n+1 */
        PAVGB" 1(%1, %%"REG_a"), %%mm2  \n\t" /* x-avg of line n+2 */
        "add %%"REG_a", %1              \n\t"
        PAVGB" %%mm1, %%mm0             \n\t" /* y-avg: combine adjacent lines */
        PAVGB" %%mm2, %%mm1             \n\t"
        PAVGB" (%2), %%mm0              \n\t" /* average into existing dst */
        PAVGB" (%2, %3), %%mm1          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        PAVGB" 1(%1, %3), %%mm1         \n\t"
        PAVGB" 1(%1, %%"REG_a"), %%mm0  \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        PAVGB" %%mm1, %%mm2             \n\t"
        PAVGB" %%mm0, %%mm1             \n\t"
        PAVGB" (%2), %%mm2              \n\t"
        PAVGB" (%2, %3), %%mm1          \n\t"
        "movq %%mm2, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t" /* 4 lines per iteration */
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
651 | 786 |
787 //FIXME the following could be optimized too ... | |
1064 | 788 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 789 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); |
790 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); | |
791 } | |
1064 | 792 static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 793 DEF(put_pixels8_y2)(block , pixels , line_size, h); |
794 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); | |
795 } | |
1064 | 796 static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 797 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); |
798 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); | |
799 } | |
1064 | 800 static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 801 DEF(avg_pixels8)(block , pixels , line_size, h); |
802 DEF(avg_pixels8)(block+8, pixels+8, line_size, h); | |
803 } | |
1064 | 804 static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 805 DEF(avg_pixels8_x2)(block , pixels , line_size, h); |
806 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); | |
807 } | |
1064 | 808 static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 809 DEF(avg_pixels8_y2)(block , pixels , line_size, h); |
810 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); | |
811 } | |
1064 | 812 static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 813 DEF(avg_pixels8_xy2)(block , pixels , line_size, h); |
814 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); | |
815 } | |
816 |