Mercurial > libavcodec.hg
annotate i386/dsputil_mmx_avg.h @ 2207:22b768f1261a libavcodec
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which was written and committed a long time ago but apparently never used; qpel motion compensation is 5% faster
now
author | michael |
---|---|
date | Mon, 06 Sep 2004 03:17:31 +0000 |
parents | f65d87bfdd5a |
children | c4a476971abc |
rev | line source |
---|---|
0 | 1 /* |
2 * DSP utils : average functions are compiled twice for 3dnow/mmx2 | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
1739
07a484280a82
copyright year update of the files I touched and remembered; things look annoyingly unmaintained otherwise
michael
parents:
1064
diff
changeset
|
4 * Copyright (c) 2002-2004 Michael Niedermayer |
0 | 5 * |
429 | 6 * This library is free software; you can redistribute it and/or |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2 of the License, or (at your option) any later version. | |
0 | 10 * |
429 | 11 * This library is distributed in the hope that it will be useful, |
0 | 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 * Lesser General Public License for more details. | |
0 | 15 * |
429 | 16 * You should have received a copy of the GNU Lesser General Public |
17 * License along with this library; if not, write to the Free Software | |
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
0 | 19 * |
20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
386 | 21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> |
441 | 22 * and improved by Zdenek Kabelac <kabi@users.sf.net> |
0 | 23 */ |
387 | 24 |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by putting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
25 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
26 clobber bug - now it will work with 2.95.2 and also with -fPIC |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
27 */ |
/*
 * put_pixels8_x2: for each row, load 8 bytes from pixels, combine them with
 * the 8 bytes starting one byte further (pixels + 1) using PAVGB, and store
 * the 8-byte result to block.  Source and destination rows are both
 * line_size bytes apart.
 *
 * PAVGB and DEF are macros defined outside this view -- presumably the
 * packed-byte-average instruction and the per-CPU name suffix of the
 * mmx2/3dnow build; confirm against the including file.
 *
 * Asm operands: %0 = h (row counter), %1 = pixels ("S"), %2 = block ("D"),
 * %3 = line_size.  %eax is used as scratch and is listed as clobbered.
 *
 * The loop handles four rows per iteration and exits only when h reaches
 * exactly zero after "subl $4", so h is presumably expected to be a positive
 * multiple of 4 -- TODO confirm against callers.
 */
1064 | 28 static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 29 { |
386 | 30 __asm __volatile( |
/* eax = 2 * line_size: the step used to advance both pointers by two rows */
441 | 31 "lea (%3, %3), %%eax \n\t" |
32 "1: \n\t" | |
/* rows 0 and 1 of this iteration: load, combine with the +1 byte offset, store */
33 "movq (%1), %%mm0 \n\t" | |
34 "movq (%1, %3), %%mm1 \n\t" | |
35 PAVGB" 1(%1), %%mm0 \n\t" | |
36 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
37 "movq %%mm0, (%2) \n\t" | |
38 "movq %%mm1, (%2, %3) \n\t" | |
39 "addl %%eax, %1 \n\t" | |
40 "addl %%eax, %2 \n\t" | |
/* rows 2 and 3 of this iteration: same sequence after advancing two rows */
41 "movq (%1), %%mm0 \n\t" | |
42 "movq (%1, %3), %%mm1 \n\t" | |
43 PAVGB" 1(%1), %%mm0 \n\t" | |
44 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
45 "addl %%eax, %1 \n\t" | |
46 "movq %%mm0, (%2) \n\t" | |
47 "movq %%mm1, (%2, %3) \n\t" | |
48 "addl %%eax, %2 \n\t" | |
/* four rows consumed; repeat until the counter reaches zero */
49 "subl $4, %0 \n\t" | |
50 "jnz 1b \n\t" | |
51 :"+g"(h), "+S"(pixels), "+D"(block) | |
52 :"r" (line_size) | |
53 :"%eax", "memory"); | |
54 } | |
651 | 55 |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
56 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
954 | 57 { |
58 __asm __volatile( | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
59 "testl $1, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
60 " jz 1f \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
61 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
62 "movq (%2), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
63 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
64 "addl $8, %2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
65 PAVGB" %%mm1, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
66 "movq %%mm0, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
67 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
68 "decl %0 \n\t" |
954 | 69 "1: \n\t" |
70 "movq (%1), %%mm0 \n\t" | |
71 "addl %4, %1 \n\t" | |
72 "movq (%1), %%mm1 \n\t" | |
73 "addl %4, %1 \n\t" | |
74 PAVGB" (%2), %%mm0 \n\t" | |
75 PAVGB" 8(%2), %%mm1 \n\t" | |
76 "movq %%mm0, (%3) \n\t" | |
77 "addl %5, %3 \n\t" | |
78 "movq %%mm1, (%3) \n\t" | |
79 "addl %5, %3 \n\t" | |
80 "movq (%1), %%mm0 \n\t" | |
81 "addl %4, %1 \n\t" | |
82 "movq (%1), %%mm1 \n\t" | |
83 "addl %4, %1 \n\t" | |
84 PAVGB" 16(%2), %%mm0 \n\t" | |
85 PAVGB" 24(%2), %%mm1 \n\t" | |
86 "movq %%mm0, (%3) \n\t" | |
87 "addl %5, %3 \n\t" | |
88 "movq %%mm1, (%3) \n\t" | |
89 "addl %5, %3 \n\t" | |
90 "addl $32, %2 \n\t" | |
91 "subl $4, %0 \n\t" | |
92 "jnz 1b \n\t" | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
93 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
94 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
95 #else |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
96 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
97 #endif |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
98 :"S"(src1Stride), "D"(dstStride) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
99 :"memory"); |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
100 //the following should be used, though better not with gcc ... |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
101 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
954 | 102 :"r"(src1Stride), "r"(dstStride) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
103 :"memory");*/ |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
104 } |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
105 |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
106 static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
107 { |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
108 __asm __volatile( |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
109 "pcmpeqb %%mm6, %%mm6 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
110 "testl $1, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
111 " jz 1f \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
112 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
113 "movq (%2), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
114 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
115 "addl $8, %2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
116 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
117 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
118 PAVGB" %%mm1, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
119 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
120 "movq %%mm0, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
121 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
122 "decl %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
123 "1: \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
124 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
125 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
126 "movq (%1), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
127 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
128 "movq (%2), %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
129 "movq 8(%2), %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
130 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
131 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
132 "pxor %%mm6, %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
133 "pxor %%mm6, %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
134 PAVGB" %%mm2, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
135 PAVGB" %%mm3, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
136 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
137 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
138 "movq %%mm0, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
139 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
140 "movq %%mm1, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
141 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
142 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
143 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
144 "movq (%1), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
145 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
146 "movq 16(%2), %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
147 "movq 24(%2), %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
148 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
149 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
150 "pxor %%mm6, %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
151 "pxor %%mm6, %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
152 PAVGB" %%mm2, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
153 PAVGB" %%mm3, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
154 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
155 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
156 "movq %%mm0, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
157 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
158 "movq %%mm1, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
159 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
160 "addl $32, %2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
161 "subl $4, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
162 "jnz 1b \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
163 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
164 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
165 #else |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
166 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
167 #endif |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
168 :"S"(src1Stride), "D"(dstStride) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
169 :"memory"); |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
170 //the following should be used, though better not with gcc ... |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
171 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
172 :"r"(src1Stride), "r"(dstStride) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
173 :"memory");*/ |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
174 } |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
175 |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
176 static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
177 { |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
178 __asm __volatile( |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
179 "testl $1, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
180 " jz 1f \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
181 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
182 "movq (%2), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
183 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
184 "addl $8, %2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
185 PAVGB" %%mm1, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
186 PAVGB" (%3), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
187 "movq %%mm0, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
188 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
189 "decl %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
190 "1: \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
191 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
192 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
193 "movq (%1), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
194 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
195 PAVGB" (%2), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
196 PAVGB" 8(%2), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
197 PAVGB" (%3), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
198 "movq %%mm0, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
199 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
200 PAVGB" (%3), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
201 "movq %%mm1, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
202 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
203 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
204 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
205 "movq (%1), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
206 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
207 PAVGB" 16(%2), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
208 PAVGB" 24(%2), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
209 PAVGB" (%3), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
210 "movq %%mm0, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
211 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
212 PAVGB" (%3), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
213 "movq %%mm1, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
214 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
215 "addl $32, %2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
216 "subl $4, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
217 "jnz 1b \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
218 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
219 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
220 #else |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
221 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
222 #endif |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
223 :"S"(src1Stride), "D"(dstStride) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
224 :"memory"); |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
225 //the following should be used, though better not with gcc ... |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
226 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
227 :"r"(src1Stride), "r"(dstStride) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
228 :"memory");*/ |
954 | 229 } |
230 | |
/*
 * put_pixels16_x2: for every row, combine each source byte with its
 * right-hand neighbour using PAVGB and store the 16 results to block.
 * PAVGB is a macro defined outside this view; presumably the packed
 * unsigned byte average of the selected instruction set -- confirm.
 *
 * block     - destination; 16 bytes are written per row.
 * pixels    - source; bytes 0..16 of every row are read (17 bytes).
 * line_size - byte distance between rows; the same value is applied to
 *             both the source and the destination pointer.
 * h         - row count. Each pass of the loop handles four rows and the
 *             loop only exits when the count reaches exactly zero, so h
 *             has to be a positive multiple of 4.
 *
 * Notes for maintainers:
 *  - eax holds 2 * line_size and is listed as clobbered.
 *  - mm0..mm3 are overwritten but do not appear in the clobber list.
 *  - addl is applied to the pointer operands, so the code as written
 *    assumes 32-bit pointers.
 *  - NOTE(review): no emms is issued here; presumably the caller is
 *    responsible for that -- confirm.
 */
1064 | 231 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
651 | 232 {
233 __asm __volatile( | |
/* eax = 2 * line_size, the step used after each pair of rows */
234 "lea (%3, %3), %%eax \n\t" | |
235 "1: \n\t" | |
/* first pair of rows: left and right 8-byte halves of row n and row n+1 */
236 "movq (%1), %%mm0 \n\t" | |
237 "movq (%1, %3), %%mm1 \n\t" | |
238 "movq 8(%1), %%mm2 \n\t" | |
239 "movq 8(%1, %3), %%mm3 \n\t" | |
/* combine with the same data shifted by one byte (offsets 1 and 9) */
240 PAVGB" 1(%1), %%mm0 \n\t" | |
241 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
242 PAVGB" 9(%1), %%mm2 \n\t" | |
243 PAVGB" 9(%1, %3), %%mm3 \n\t" | |
244 "movq %%mm0, (%2) \n\t" | |
245 "movq %%mm1, (%2, %3) \n\t" | |
246 "movq %%mm2, 8(%2) \n\t" | |
247 "movq %%mm3, 8(%2, %3) \n\t" | |
248 "addl %%eax, %1 \n\t" | |
249 "addl %%eax, %2 \n\t" | |
/* second pair of rows, same pattern */
250 "movq (%1), %%mm0 \n\t" | |
251 "movq (%1, %3), %%mm1 \n\t" | |
252 "movq 8(%1), %%mm2 \n\t" | |
253 "movq 8(%1, %3), %%mm3 \n\t" | |
254 PAVGB" 1(%1), %%mm0 \n\t" | |
255 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
256 PAVGB" 9(%1), %%mm2 \n\t" | |
257 PAVGB" 9(%1, %3), %%mm3 \n\t" | |
258 "addl %%eax, %1 \n\t" | |
259 "movq %%mm0, (%2) \n\t" | |
260 "movq %%mm1, (%2, %3) \n\t" | |
261 "movq %%mm2, 8(%2) \n\t" | |
262 "movq %%mm3, 8(%2, %3) \n\t" | |
263 "addl %%eax, %2 \n\t" | |
/* four rows done; loop until the row count is exactly zero */
264 "subl $4, %0 \n\t" | |
265 "jnz 1b \n\t" | |
/* operands: %0 = h, %1 = pixels (esi), %2 = block (edi), %3 = line_size */
266 :"+g"(h), "+S"(pixels), "+D"(block) | |
267 :"r" (line_size) | |
268 :"%eax", "memory"); | |
269 } | |
954 | 270 |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/*
 * put_pixels16_l2: for every row, combine 16 bytes of src1 with 16 bytes
 * of src2 using PAVGB and store the result to dst.
 * PAVGB is a macro defined outside this view; presumably the packed
 * unsigned byte average of the selected instruction set -- confirm.
 *
 * dst        - destination; advanced by dstStride after every row.
 * src1       - first source; advanced by src1Stride after every row.
 * src2       - second source; consumed as consecutive 16-byte rows (the
 *              code steps it by 16 bytes per row, it has no stride
 *              argument).
 * dstStride  - byte distance between destination rows.
 * src1Stride - byte distance between src1 rows.
 * h          - row count. An odd count is reduced by one peeled row, then
 *              the loop handles two rows per pass and only exits when the
 *              count reaches exactly zero. The loop body runs before the
 *              count is tested, so h has to be at least 2.
 *
 * Notes for maintainers:
 *  - Operands: %0 = h, %1 = src1 (eax), %2 = src2 (ecx), %3 = dst (edx),
 *    %4 = src1Stride (esi), %5 = dstStride (edi).
 *  - With PIC defined h lives in memory instead of ebx; presumably ebx
 *    is reserved in PIC builds -- confirm.
 *  - mm0 and mm1 are overwritten but do not appear in the clobber list.
 *  - addl is applied to the pointer operands, so the code as written
 *    assumes 32-bit pointers.
 */
271 static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
954 | 272 {
273 __asm __volatile( | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/* odd row count: handle a single row first so the remaining count is even */
274 "testl $1, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
275 " jz 1f \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
276 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
277 "movq 8(%1), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
278 PAVGB" (%2), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
279 PAVGB" 8(%2), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
280 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
281 "addl $16, %2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
282 "movq %%mm0, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
283 "movq %%mm1, 8(%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
284 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
285 "decl %0 \n\t" |
/* main loop: two rows per pass, src2 read at offsets 0..15 and 16..31 */
954 | 286 "1: \n\t" |
287 "movq (%1), %%mm0 \n\t" | |
288 "movq 8(%1), %%mm1 \n\t" | |
289 "addl %4, %1 \n\t" | |
290 PAVGB" (%2), %%mm0 \n\t" | |
291 PAVGB" 8(%2), %%mm1 \n\t" | |
292 "movq %%mm0, (%3) \n\t" | |
293 "movq %%mm1, 8(%3) \n\t" | |
294 "addl %5, %3 \n\t" | |
295 "movq (%1), %%mm0 \n\t" | |
296 "movq 8(%1), %%mm1 \n\t" | |
297 "addl %4, %1 \n\t" | |
298 PAVGB" 16(%2), %%mm0 \n\t" | |
299 PAVGB" 24(%2), %%mm1 \n\t" | |
300 "movq %%mm0, (%3) \n\t" | |
301 "movq %%mm1, 8(%3) \n\t" | |
302 "addl %5, %3 \n\t" | |
303 "addl $32, %2 \n\t" | |
304 "subl $2, %0 \n\t" | |
305 "jnz 1b \n\t" | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
306 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
307 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
308 #else |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
309 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
310 #endif |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
311 :"S"(src1Stride), "D"(dstStride) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
312 :"memory"); |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
313 //the following should be used, though better not with gcc ... |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
314 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
315 :"r"(src1Stride), "r"(dstStride) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
316 :"memory");*/ |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
317 } |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
318 |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/*
 * avg_pixels16_l2: for every row, combine 16 bytes of src1 with 16 bytes
 * of src2 using PAVGB, combine that result with the bytes already stored
 * in dst using PAVGB again, and write the outcome back to dst.
 * PAVGB is a macro defined outside this view; presumably the packed
 * unsigned byte average of the selected instruction set -- confirm.
 *
 * dst        - destination; read and then overwritten, advanced by
 *              dstStride after every row.
 * src1       - first source; advanced by src1Stride after every row.
 * src2       - second source; consumed as consecutive 16-byte rows (the
 *              code steps it by 16 bytes per row, it has no stride
 *              argument).
 * dstStride  - byte distance between destination rows.
 * src1Stride - byte distance between src1 rows.
 * h          - row count. An odd count is reduced by one peeled row, then
 *              the loop handles two rows per pass and only exits when the
 *              count reaches exactly zero. The loop body runs before the
 *              count is tested, so h has to be at least 2.
 *
 * Notes for maintainers:
 *  - Operands: %0 = h, %1 = src1 (eax), %2 = src2 (ecx), %3 = dst (edx),
 *    %4 = src1Stride (esi), %5 = dstStride (edi).
 *  - With PIC defined h lives in memory instead of ebx; presumably ebx
 *    is reserved in PIC builds -- confirm.
 *  - mm0 and mm1 are overwritten but do not appear in the clobber list.
 *  - addl is applied to the pointer operands, so the code as written
 *    assumes 32-bit pointers.
 */
319 static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
320 { |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
321 __asm __volatile( |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/* odd row count: handle a single row first so the remaining count is even */
322 "testl $1, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
323 " jz 1f \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
324 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
325 "movq 8(%1), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
326 PAVGB" (%2), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
327 PAVGB" 8(%2), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
328 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
329 "addl $16, %2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
330 PAVGB" (%3), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
331 PAVGB" 8(%3), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
332 "movq %%mm0, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
333 "movq %%mm1, 8(%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
334 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
335 "decl %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/* main loop: two rows per pass, src2 read at offsets 0..15 and 16..31 */
336 "1: \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
337 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
338 "movq 8(%1), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
339 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
340 PAVGB" (%2), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
341 PAVGB" 8(%2), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
342 PAVGB" (%3), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
343 PAVGB" 8(%3), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
344 "movq %%mm0, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
345 "movq %%mm1, 8(%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
346 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
347 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
348 "movq 8(%1), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
349 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
350 PAVGB" 16(%2), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
351 PAVGB" 24(%2), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
352 PAVGB" (%3), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
353 PAVGB" 8(%3), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
354 "movq %%mm0, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
355 "movq %%mm1, 8(%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
356 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
357 "addl $32, %2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
358 "subl $2, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
359 "jnz 1b \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
360 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
361 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
362 #else |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
363 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
364 #endif |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
365 :"S"(src1Stride), "D"(dstStride) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
366 :"memory"); |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
367 //the following should be used, though better not with gcc ... |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
368 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
954 | 369 :"r"(src1Stride), "r"(dstStride) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
370 :"memory");*/ |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
371 } |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
372 |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
373 static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
374 { |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
375 __asm __volatile( |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
376 "pcmpeqb %%mm6, %%mm6\n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
377 "testl $1, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
378 " jz 1f \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
379 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
380 "movq 8(%1), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
381 "movq (%2), %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
382 "movq 8(%2), %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
383 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
384 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
385 "pxor %%mm6, %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
386 "pxor %%mm6, %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
387 PAVGB" %%mm2, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
388 PAVGB" %%mm3, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
389 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
390 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
391 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
392 "addl $16, %2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
393 "movq %%mm0, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
394 "movq %%mm1, 8(%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
395 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
396 "decl %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
397 "1: \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
398 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
399 "movq 8(%1), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
400 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
401 "movq (%2), %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
402 "movq 8(%2), %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
403 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
404 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
405 "pxor %%mm6, %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
406 "pxor %%mm6, %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
407 PAVGB" %%mm2, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
408 PAVGB" %%mm3, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
409 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
410 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
411 "movq %%mm0, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
412 "movq %%mm1, 8(%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
413 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
414 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
415 "movq 8(%1), %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
416 "addl %4, %1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
417 "movq 16(%2), %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
418 "movq 24(%2), %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
419 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
420 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
421 "pxor %%mm6, %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
422 "pxor %%mm6, %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
423 PAVGB" %%mm2, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
424 PAVGB" %%mm3, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
425 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
426 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
427 "movq %%mm0, (%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
428 "movq %%mm1, 8(%3) \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
429 "addl %5, %3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
430 "addl $32, %2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
431 "subl $2, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
432 "jnz 1b \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
433 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
434 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
435 #else |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
436 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
437 #endif |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
438 :"S"(src1Stride), "D"(dstStride) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
439 :"memory"); |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
440 //the following should be used, though better not with gcc ... |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
441 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
442 :"r"(src1Stride), "r"(dstStride) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
443 :"memory");*/ |
954 | 444 } |
441 | 445 |
446 /* GL: this function does incorrect rounding if overflow */ | |
1064 | 447 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
441 | 448 { |
448 | 449 MOVQ_BONE(mm6); |
441 | 450 __asm __volatile( |
451 "lea (%3, %3), %%eax \n\t" | |
439 | 452 "1: \n\t" |
453 "movq (%1), %%mm0 \n\t" | |
454 "movq (%1, %3), %%mm2 \n\t" | |
455 "movq 1(%1), %%mm1 \n\t" | |
456 "movq 1(%1, %3), %%mm3 \n\t" | |
441 | 457 "addl %%eax, %1 \n\t" |
448 | 458 "psubusb %%mm6, %%mm0 \n\t" |
459 "psubusb %%mm6, %%mm2 \n\t" | |
386 | 460 PAVGB" %%mm1, %%mm0 \n\t" |
461 PAVGB" %%mm3, %%mm2 \n\t" | |
439 | 462 "movq %%mm0, (%2) \n\t" |
463 "movq %%mm2, (%2, %3) \n\t" | |
464 "movq (%1), %%mm0 \n\t" | |
465 "movq 1(%1), %%mm1 \n\t" | |
466 "movq (%1, %3), %%mm2 \n\t" | |
467 "movq 1(%1, %3), %%mm3 \n\t" | |
441 | 468 "addl %%eax, %2 \n\t" |
439 | 469 "addl %%eax, %1 \n\t" |
448 | 470 "psubusb %%mm6, %%mm0 \n\t" |
471 "psubusb %%mm6, %%mm2 \n\t" | |
386 | 472 PAVGB" %%mm1, %%mm0 \n\t" |
473 PAVGB" %%mm3, %%mm2 \n\t" | |
439 | 474 "movq %%mm0, (%2) \n\t" |
475 "movq %%mm2, (%2, %3) \n\t" | |
441 | 476 "addl %%eax, %2 \n\t" |
477 "subl $4, %0 \n\t" | |
478 "jnz 1b \n\t" | |
479 :"+g"(h), "+S"(pixels), "+D"(block) | |
480 :"r" (line_size) | |
386 | 481 :"%eax", "memory"); |
0 | 482 } |
483 | |
1064 | 484 static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 485 { |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
486 __asm __volatile( |
441 | 487 "lea (%3, %3), %%eax \n\t" |
488 "movq (%1), %%mm0 \n\t" | |
489 "subl %3, %2 \n\t" | |
490 "1: \n\t" | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
491 "movq (%1, %3), %%mm1 \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
492 "movq (%1, %%eax), %%mm2 \n\t" |
441 | 493 "addl %%eax, %1 \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
494 PAVGB" %%mm1, %%mm0 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
495 PAVGB" %%mm2, %%mm1 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
496 "movq %%mm0, (%2, %3) \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
497 "movq %%mm1, (%2, %%eax) \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
498 "movq (%1, %3), %%mm1 \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
499 "movq (%1, %%eax), %%mm0 \n\t" |
441 | 500 "addl %%eax, %2 \n\t" |
501 "addl %%eax, %1 \n\t" | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
502 PAVGB" %%mm1, %%mm2 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
503 PAVGB" %%mm0, %%mm1 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
504 "movq %%mm2, (%2, %3) \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
505 "movq %%mm1, (%2, %%eax) \n\t" |
441 | 506 "addl %%eax, %2 \n\t" |
507 "subl $4, %0 \n\t" | |
508 "jnz 1b \n\t" | |
509 :"+g"(h), "+S"(pixels), "+D" (block) | |
510 :"r" (line_size) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
511 :"%eax", "memory"); |
386 | 512 } |
513 | |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
514 /* GL: this function does incorrect rounding if overflow */ |
1064 | 515 static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
386 | 516 { |
448 | 517 MOVQ_BONE(mm6); |
0 | 518 __asm __volatile( |
441 | 519 "lea (%3, %3), %%eax \n\t" |
520 "movq (%1), %%mm0 \n\t" | |
521 "subl %3, %2 \n\t" | |
522 "1: \n\t" | |
439 | 523 "movq (%1, %3), %%mm1 \n\t" |
524 "movq (%1, %%eax), %%mm2 \n\t" | |
441 | 525 "addl %%eax, %1 \n\t" |
448 | 526 "psubusb %%mm6, %%mm1 \n\t" |
386 | 527 PAVGB" %%mm1, %%mm0 \n\t" |
528 PAVGB" %%mm2, %%mm1 \n\t" | |
439 | 529 "movq %%mm0, (%2, %3) \n\t" |
530 "movq %%mm1, (%2, %%eax) \n\t" | |
531 "movq (%1, %3), %%mm1 \n\t" | |
532 "movq (%1, %%eax), %%mm0 \n\t" | |
441 | 533 "addl %%eax, %2 \n\t" |
534 "addl %%eax, %1 \n\t" | |
448 | 535 "psubusb %%mm6, %%mm1 \n\t" |
386 | 536 PAVGB" %%mm1, %%mm2 \n\t" |
537 PAVGB" %%mm0, %%mm1 \n\t" | |
439 | 538 "movq %%mm2, (%2, %3) \n\t" |
539 "movq %%mm1, (%2, %%eax) \n\t" | |
441 | 540 "addl %%eax, %2 \n\t" |
541 "subl $4, %0 \n\t" | |
542 "jnz 1b \n\t" | |
543 :"+g"(h), "+S"(pixels), "+D" (block) | |
544 :"r" (line_size) | |
439 | 545 :"%eax", "memory"); |
0 | 546 } |
547 | |
1064 | 548 static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 549 { |
550 __asm __volatile( | |
441 | 551 "lea (%3, %3), %%eax \n\t" |
552 "1: \n\t" | |
553 "movq (%2), %%mm0 \n\t" | |
554 "movq (%2, %3), %%mm1 \n\t" | |
555 PAVGB" (%1), %%mm0 \n\t" | |
556 PAVGB" (%1, %3), %%mm1 \n\t" | |
557 "movq %%mm0, (%2) \n\t" | |
558 "movq %%mm1, (%2, %3) \n\t" | |
559 "addl %%eax, %1 \n\t" | |
560 "addl %%eax, %2 \n\t" | |
561 "movq (%2), %%mm0 \n\t" | |
562 "movq (%2, %3), %%mm1 \n\t" | |
563 PAVGB" (%1), %%mm0 \n\t" | |
564 PAVGB" (%1, %3), %%mm1 \n\t" | |
565 "addl %%eax, %1 \n\t" | |
566 "movq %%mm0, (%2) \n\t" | |
567 "movq %%mm1, (%2, %3) \n\t" | |
568 "addl %%eax, %2 \n\t" | |
569 "subl $4, %0 \n\t" | |
570 "jnz 1b \n\t" | |
571 :"+g"(h), "+S"(pixels), "+D"(block) | |
572 :"r" (line_size) | |
386 | 573 :"%eax", "memory"); |
0 | 574 } |
575 | |
1064 | 576 static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 577 { |
578 __asm __volatile( | |
441 | 579 "lea (%3, %3), %%eax \n\t" |
580 "1: \n\t" | |
581 "movq (%1), %%mm0 \n\t" | |
582 "movq (%1, %3), %%mm2 \n\t" | |
583 PAVGB" 1(%1), %%mm0 \n\t" | |
584 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
585 PAVGB" (%2), %%mm0 \n\t" | |
586 PAVGB" (%2, %3), %%mm2 \n\t" | |
587 "addl %%eax, %1 \n\t" | |
588 "movq %%mm0, (%2) \n\t" | |
589 "movq %%mm2, (%2, %3) \n\t" | |
590 "movq (%1), %%mm0 \n\t" | |
591 "movq (%1, %3), %%mm2 \n\t" | |
592 PAVGB" 1(%1), %%mm0 \n\t" | |
593 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
594 "addl %%eax, %2 \n\t" | |
595 "addl %%eax, %1 \n\t" | |
596 PAVGB" (%2), %%mm0 \n\t" | |
597 PAVGB" (%2, %3), %%mm2 \n\t" | |
598 "movq %%mm0, (%2) \n\t" | |
599 "movq %%mm2, (%2, %3) \n\t" | |
600 "addl %%eax, %2 \n\t" | |
601 "subl $4, %0 \n\t" | |
602 "jnz 1b \n\t" | |
603 :"+g"(h), "+S"(pixels), "+D"(block) | |
604 :"r" (line_size) | |
386 | 605 :"%eax", "memory"); |
0 | 606 } |
607 | |
1064 | 608 static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 609 { |
610 __asm __volatile( | |
441 | 611 "lea (%3, %3), %%eax \n\t" |
386 | 612 "movq (%1), %%mm0 \n\t" |
441 | 613 "subl %3, %2 \n\t" |
614 "1: \n\t" | |
615 "movq (%1, %3), %%mm1 \n\t" | |
616 "movq (%1, %%eax), %%mm2 \n\t" | |
617 "addl %%eax, %1 \n\t" | |
386 | 618 PAVGB" %%mm1, %%mm0 \n\t" |
619 PAVGB" %%mm2, %%mm1 \n\t" | |
441 | 620 "movq (%2, %3), %%mm3 \n\t" |
621 "movq (%2, %%eax), %%mm4 \n\t" | |
386 | 622 PAVGB" %%mm3, %%mm0 \n\t" |
623 PAVGB" %%mm4, %%mm1 \n\t" | |
441 | 624 "movq %%mm0, (%2, %3) \n\t" |
625 "movq %%mm1, (%2, %%eax) \n\t" | |
626 "movq (%1, %3), %%mm1 \n\t" | |
627 "movq (%1, %%eax), %%mm0 \n\t" | |
386 | 628 PAVGB" %%mm1, %%mm2 \n\t" |
629 PAVGB" %%mm0, %%mm1 \n\t" | |
441 | 630 "addl %%eax, %2 \n\t" |
631 "addl %%eax, %1 \n\t" | |
632 "movq (%2, %3), %%mm3 \n\t" | |
633 "movq (%2, %%eax), %%mm4 \n\t" | |
386 | 634 PAVGB" %%mm3, %%mm2 \n\t" |
635 PAVGB" %%mm4, %%mm1 \n\t" | |
441 | 636 "movq %%mm2, (%2, %3) \n\t" |
637 "movq %%mm1, (%2, %%eax) \n\t" | |
638 "addl %%eax, %2 \n\t" | |
639 "subl $4, %0 \n\t" | |
640 "jnz 1b \n\t" | |
641 :"+g"(h), "+S"(pixels), "+D"(block) | |
642 :"r" (line_size) | |
643 :"%eax", "memory"); | |
0 | 644 } |
645 | |
386 | 646 // Note: this is not correctly rounded, but this function is only used for B-frames, so it doesn't matter
1064 | 647 static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 648 { |
448 | 649 MOVQ_BONE(mm6); |
0 | 650 __asm __volatile( |
442 | 651 "lea (%3, %3), %%eax \n\t" |
386 | 652 "movq (%1), %%mm0 \n\t" |
442 | 653 PAVGB" 1(%1), %%mm0 \n\t" |
654 ".balign 8 \n\t" | |
441 | 655 "1: \n\t" |
442 | 656 "movq (%1, %%eax), %%mm2 \n\t" |
657 "movq (%1, %3), %%mm1 \n\t" | |
448 | 658 "psubusb %%mm6, %%mm2 \n\t" |
442 | 659 PAVGB" 1(%1, %3), %%mm1 \n\t" |
660 PAVGB" 1(%1, %%eax), %%mm2 \n\t" | |
661 "addl %%eax, %1 \n\t" | |
386 | 662 PAVGB" %%mm1, %%mm0 \n\t" |
663 PAVGB" %%mm2, %%mm1 \n\t" | |
442 | 664 PAVGB" (%2), %%mm0 \n\t" |
665 PAVGB" (%2, %3), %%mm1 \n\t" | |
666 "movq %%mm0, (%2) \n\t" | |
667 "movq %%mm1, (%2, %3) \n\t" | |
668 "movq (%1, %3), %%mm1 \n\t" | |
669 "movq (%1, %%eax), %%mm0 \n\t" | |
670 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
671 PAVGB" 1(%1, %%eax), %%mm0 \n\t" | |
672 "addl %%eax, %2 \n\t" | |
673 "addl %%eax, %1 \n\t" | |
386 | 674 PAVGB" %%mm1, %%mm2 \n\t" |
675 PAVGB" %%mm0, %%mm1 \n\t" | |
442 | 676 PAVGB" (%2), %%mm2 \n\t" |
677 PAVGB" (%2, %3), %%mm1 \n\t" | |
678 "movq %%mm2, (%2) \n\t" | |
679 "movq %%mm1, (%2, %3) \n\t" | |
680 "addl %%eax, %2 \n\t" | |
441 | 681 "subl $4, %0 \n\t" |
442 | 682 "jnz 1b \n\t" |
443 | 683 :"+g"(h), "+S"(pixels), "+D"(block) |
442 | 684 :"r" (line_size) |
386 | 685 :"%eax", "memory"); |
0 | 686 } |
651 | 687 |
688 //FIXME the following could be optimized too ... | |
1064 | 689 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 690 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); |
691 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); | |
692 } | |
1064 | 693 static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 694 DEF(put_pixels8_y2)(block , pixels , line_size, h); |
695 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); | |
696 } | |
1064 | 697 static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 698 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); |
699 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); | |
700 } | |
1064 | 701 static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 702 DEF(avg_pixels8)(block , pixels , line_size, h); |
703 DEF(avg_pixels8)(block+8, pixels+8, line_size, h); | |
704 } | |
1064 | 705 static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 706 DEF(avg_pixels8_x2)(block , pixels , line_size, h); |
707 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); | |
708 } | |
1064 | 709 static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 710 DEF(avg_pixels8_y2)(block , pixels , line_size, h); |
711 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); | |
712 } | |
1064 | 713 static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 714 DEF(avg_pixels8_xy2)(block , pixels , line_size, h); |
715 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); | |
716 } | |
717 |