annotate i386/dsputil_mmx_avg.h @ 2463:9baa47d8297b libavcodec

check norm6 vlc validity as there are some bit sequences which don't correspond to any codeword; the other vlc tables all seem to be Huffman tables, though
author michael
date Tue, 25 Jan 2005 01:29:10 +0000
parents 15cfba1b97b5
children 00f608ae3791
/*
 * DSP utils : average functions are compiled twice for 3dnow/mmx2
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 */

/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
   clobber bug - now it will work with 2.95.2 and also with -fPIC
 */
static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        PAVGB" 1(%1), %%mm0             \n\t"
        PAVGB" 1(%1, %3), %%mm1         \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        PAVGB" 1(%1), %%mm0             \n\t"
        PAVGB" 1(%1, %3), %%mm1         \n\t"
        "add %%"REG_a", %1              \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
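/* A plain-C sketch of what the asm above computes (illustrative addition, not
   part of the original source): each output byte is the rounded average of two
   horizontally adjacent input bytes, matching PAVGB/PAVGUSB semantics
   (a + b + 1) >> 1; the asm unrolls this to four rows per loop iteration. */
#if 0
static void put_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            block[j] = (pixels[j] + pixels[j + 1] + 1) >> 1;
        pixels += line_size;
        block  += line_size;
    }
}
#endif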

static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        "testl $1, %0                   \n\t"
        " jz 1f                         \n\t"
        "movd (%1), %%mm0               \n\t"
        "movd (%2), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        "add $4, %2                     \n\t"
        PAVGB" %%mm1, %%mm0             \n\t"
        "movd %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "decl %0                        \n\t"
        "1:                             \n\t"
        "movd (%1), %%mm0               \n\t"
        "add %4, %1                     \n\t"
        "movd (%1), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        PAVGB" (%2), %%mm0              \n\t"
        PAVGB" 4(%2), %%mm1             \n\t"
        "movd %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "movd %%mm1, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "movd (%1), %%mm0               \n\t"
        "add %4, %1                     \n\t"
        "movd (%1), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        PAVGB" 8(%2), %%mm0             \n\t"
        PAVGB" 12(%2), %%mm1            \n\t"
        "movd %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "movd %%mm1, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "add $16, %2                    \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
}
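/* Plain-C sketch of the *_l2 semantics above (illustrative, not part of the
   original source): dst receives the rounded byte-wise average of a strided
   src1 and a packed src2 (4 bytes per row here). The "testl $1, %0" prologue
   peels one row when h is odd, and the loop then handles four rows per pass,
   so h is presumably assumed to be 0 or 1 mod 4. put_pixels8_l2 below is the
   same pattern with movq and 8-byte rows. */
#if 0
static void put_pixels4_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                             int dstStride, int src1Stride, int h)
{
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 4; j++)
            dst[j] = (src1[j] + src2[j] + 1) >> 1;
        dst  += dstStride;
        src1 += src1Stride;
        src2 += 4;              /* src2 rows are packed back to back */
    }
}
#endif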


static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        "testl $1, %0                   \n\t"
        " jz 1f                         \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%2), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        "add $8, %2                     \n\t"
        PAVGB" %%mm1, %%mm0             \n\t"
        "movq %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "decl %0                        \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "add %4, %1                     \n\t"
        "movq (%1), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        PAVGB" (%2), %%mm0              \n\t"
        PAVGB" 8(%2), %%mm1             \n\t"
        "movq %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "movq %%mm1, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "movq (%1), %%mm0               \n\t"
        "add %4, %1                     \n\t"
        "movq (%1), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        PAVGB" 16(%2), %%mm0            \n\t"
        PAVGB" 24(%2), %%mm1            \n\t"
        "movq %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "movq %%mm1, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "add $32, %2                    \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}

static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        "pcmpeqb %%mm6, %%mm6           \n\t"
        "testl $1, %0                   \n\t"
        " jz 1f                         \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%2), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        "add $8, %2                     \n\t"
        "pxor %%mm6, %%mm0              \n\t"
        "pxor %%mm6, %%mm1              \n\t"
        PAVGB" %%mm1, %%mm0             \n\t"
        "pxor %%mm6, %%mm0              \n\t"
        "movq %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "decl %0                        \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "add %4, %1                     \n\t"
        "movq (%1), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        "movq (%2), %%mm2               \n\t"
        "movq 8(%2), %%mm3              \n\t"
        "pxor %%mm6, %%mm0              \n\t"
        "pxor %%mm6, %%mm1              \n\t"
        "pxor %%mm6, %%mm2              \n\t"
        "pxor %%mm6, %%mm3              \n\t"
        PAVGB" %%mm2, %%mm0             \n\t"
        PAVGB" %%mm3, %%mm1             \n\t"
        "pxor %%mm6, %%mm0              \n\t"
        "pxor %%mm6, %%mm1              \n\t"
        "movq %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "movq %%mm1, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "movq (%1), %%mm0               \n\t"
        "add %4, %1                     \n\t"
        "movq (%1), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        "movq 16(%2), %%mm2             \n\t"
        "movq 24(%2), %%mm3             \n\t"
        "pxor %%mm6, %%mm0              \n\t"
        "pxor %%mm6, %%mm1              \n\t"
        "pxor %%mm6, %%mm2              \n\t"
        "pxor %%mm6, %%mm3              \n\t"
        PAVGB" %%mm2, %%mm0             \n\t"
        PAVGB" %%mm3, %%mm1             \n\t"
        "pxor %%mm6, %%mm0              \n\t"
        "pxor %%mm6, %%mm1              \n\t"
        "movq %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "movq %%mm1, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "add $32, %2                    \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
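/* Why the pcmpeqb/pxor dance above: there is no truncating PAVGB, so the code
   uses the complement identity ~avg_round(~a, ~b) == (a + b) >> 1. mm6 is set
   to all ones with pcmpeqb, and the pxor before and after PAVGB complement the
   operands and the result, turning the round-up average into a truncating one.
   A plain-C sketch (illustrative, not part of the original source): */
#if 0
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                    int dstStride, int src1Stride, int h)
{
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (src1[j] + src2[j]) >> 1;  /* truncating average */
        dst  += dstStride;
        src1 += src1Stride;
        src2 += 8;
    }
}
#endif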

static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        "testl $1, %0                   \n\t"
        " jz 1f                         \n\t"
        "movd (%1), %%mm0               \n\t"
        "movd (%2), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        "add $4, %2                     \n\t"
        PAVGB" %%mm1, %%mm0             \n\t"
        PAVGB" (%3), %%mm0              \n\t"
        "movd %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "decl %0                        \n\t"
        "1:                             \n\t"
        "movd (%1), %%mm0               \n\t"
        "add %4, %1                     \n\t"
        "movd (%1), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        PAVGB" (%2), %%mm0              \n\t"
        PAVGB" 4(%2), %%mm1             \n\t"
        PAVGB" (%3), %%mm0              \n\t"
        "movd %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        PAVGB" (%3), %%mm1              \n\t"
        "movd %%mm1, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "movd (%1), %%mm0               \n\t"
        "add %4, %1                     \n\t"
        "movd (%1), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        PAVGB" 8(%2), %%mm0             \n\t"
        PAVGB" 12(%2), %%mm1            \n\t"
        PAVGB" (%3), %%mm0              \n\t"
        "movd %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        PAVGB" (%3), %%mm1              \n\t"
        "movd %%mm1, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "add $16, %2                    \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
}
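/* Sketch of the avg_* variant above (illustrative, not part of the original
   source): the src1/src2 average is averaged once more with what is already in
   dst, with PAVGB-style rounding at each step; avg_pixels8_l2 below is the
   analogous 8-byte version. */
#if 0
static void avg_pixels4_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                             int dstStride, int src1Stride, int h)
{
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 4; j++) {
            int tmp = (src1[j] + src2[j] + 1) >> 1;
            dst[j]  = (dst[j] + tmp + 1) >> 1;
        }
        dst  += dstStride;
        src1 += src1Stride;
        src2 += 4;
    }
}
#endif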


static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        "testl $1, %0                   \n\t"
        " jz 1f                         \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%2), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        "add $8, %2                     \n\t"
        PAVGB" %%mm1, %%mm0             \n\t"
        PAVGB" (%3), %%mm0              \n\t"
        "movq %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "decl %0                        \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "add %4, %1                     \n\t"
        "movq (%1), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        PAVGB" (%2), %%mm0              \n\t"
        PAVGB" 8(%2), %%mm1             \n\t"
        PAVGB" (%3), %%mm0              \n\t"
        "movq %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        PAVGB" (%3), %%mm1              \n\t"
        "movq %%mm1, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "movq (%1), %%mm0               \n\t"
        "add %4, %1                     \n\t"
        "movq (%1), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        PAVGB" 16(%2), %%mm0            \n\t"
        PAVGB" 24(%2), %%mm1            \n\t"
        PAVGB" (%3), %%mm0              \n\t"
        "movq %%mm0, (%3)               \n\t"
        "add %5, %3                     \n\t"
        PAVGB" (%3), %%mm1              \n\t"
        "movq %%mm1, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "add $32, %2                    \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
michael
parents: 2024
diff changeset
324 //the following should be used, though better not with gcc ...
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2024
diff changeset
325 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2024
diff changeset
326 :"r"(src1Stride), "r"(dstStride)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2024
diff changeset
327 :"memory");*/
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 651
diff changeset
328 }

static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea    (%3, %3), %%"REG_a"     \n\t"
        "1:                             \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq   8(%1), %%mm2            \n\t"
        "movq   8(%1, %3), %%mm3        \n\t"
        PAVGB"  1(%1), %%mm0            \n\t"
        PAVGB"  1(%1, %3), %%mm1        \n\t"
        PAVGB"  9(%1), %%mm2            \n\t"
        PAVGB"  9(%1, %3), %%mm3        \n\t"
        "movq   %%mm0, (%2)             \n\t"
        "movq   %%mm1, (%2, %3)         \n\t"
        "movq   %%mm2, 8(%2)            \n\t"
        "movq   %%mm3, 8(%2, %3)        \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq   8(%1), %%mm2            \n\t"
        "movq   8(%1, %3), %%mm3        \n\t"
        PAVGB"  1(%1), %%mm0            \n\t"
        PAVGB"  1(%1, %3), %%mm1        \n\t"
        PAVGB"  9(%1), %%mm2            \n\t"
        PAVGB"  9(%1, %3), %%mm3        \n\t"
        "add    %%"REG_a", %1           \n\t"
        "movq   %%mm0, (%2)             \n\t"
        "movq   %%mm1, (%2, %3)         \n\t"
        "movq   %%mm2, 8(%2)            \n\t"
        "movq   %%mm3, 8(%2, %3)        \n\t"
        "add    %%"REG_a", %2           \n\t"
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}

static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        "testl  $1, %0                  \n\t"
        " jz    1f                      \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   8(%1), %%mm1            \n\t"
        PAVGB"  (%2), %%mm0             \n\t"
        PAVGB"  8(%2), %%mm1            \n\t"
        "add    %4, %1                  \n\t"
        "add    $16, %2                 \n\t"
        "movq   %%mm0, (%3)             \n\t"
        "movq   %%mm1, 8(%3)            \n\t"
        "add    %5, %3                  \n\t"
        "decl   %0                      \n\t"
        "1:                             \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   8(%1), %%mm1            \n\t"
        "add    %4, %1                  \n\t"
        PAVGB"  (%2), %%mm0             \n\t"
        PAVGB"  8(%2), %%mm1            \n\t"
        "movq   %%mm0, (%3)             \n\t"
        "movq   %%mm1, 8(%3)            \n\t"
        "add    %5, %3                  \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   8(%1), %%mm1            \n\t"
        "add    %4, %1                  \n\t"
        PAVGB"  16(%2), %%mm0           \n\t"
        PAVGB"  24(%2), %%mm1           \n\t"
        "movq   %%mm0, (%3)             \n\t"
        "movq   %%mm1, 8(%3)            \n\t"
        "add    %5, %3                  \n\t"
        "add    $32, %2                 \n\t"
        "subl   $2, %0                  \n\t"
        "jnz    1b                      \n\t"
#ifdef PIC // Note: "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
// the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}

static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        "testl  $1, %0                  \n\t"
        " jz    1f                      \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   8(%1), %%mm1            \n\t"
        PAVGB"  (%2), %%mm0             \n\t"
        PAVGB"  8(%2), %%mm1            \n\t"
        "add    %4, %1                  \n\t"
        "add    $16, %2                 \n\t"
        PAVGB"  (%3), %%mm0             \n\t"
        PAVGB"  8(%3), %%mm1            \n\t"
        "movq   %%mm0, (%3)             \n\t"
        "movq   %%mm1, 8(%3)            \n\t"
        "add    %5, %3                  \n\t"
        "decl   %0                      \n\t"
        "1:                             \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   8(%1), %%mm1            \n\t"
        "add    %4, %1                  \n\t"
        PAVGB"  (%2), %%mm0             \n\t"
        PAVGB"  8(%2), %%mm1            \n\t"
        PAVGB"  (%3), %%mm0             \n\t"
        PAVGB"  8(%3), %%mm1            \n\t"
        "movq   %%mm0, (%3)             \n\t"
        "movq   %%mm1, 8(%3)            \n\t"
        "add    %5, %3                  \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   8(%1), %%mm1            \n\t"
        "add    %4, %1                  \n\t"
        PAVGB"  16(%2), %%mm0           \n\t"
        PAVGB"  24(%2), %%mm1           \n\t"
        PAVGB"  (%3), %%mm0             \n\t"
        PAVGB"  8(%3), %%mm1            \n\t"
        "movq   %%mm0, (%3)             \n\t"
        "movq   %%mm1, 8(%3)            \n\t"
        "add    %5, %3                  \n\t"
        "add    $32, %2                 \n\t"
        "subl   $2, %0                  \n\t"
        "jnz    1b                      \n\t"
#ifdef PIC // Note: "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
// the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}

static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        "pcmpeqb %%mm6, %%mm6           \n\t"
        "testl  $1, %0                  \n\t"
        " jz    1f                      \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   8(%1), %%mm1            \n\t"
        "movq   (%2), %%mm2             \n\t"
        "movq   8(%2), %%mm3            \n\t"
        "pxor   %%mm6, %%mm0            \n\t"
        "pxor   %%mm6, %%mm1            \n\t"
        "pxor   %%mm6, %%mm2            \n\t"
        "pxor   %%mm6, %%mm3            \n\t"
        PAVGB"  %%mm2, %%mm0            \n\t"
        PAVGB"  %%mm3, %%mm1            \n\t"
        "pxor   %%mm6, %%mm0            \n\t"
        "pxor   %%mm6, %%mm1            \n\t"
        "add    %4, %1                  \n\t"
        "add    $16, %2                 \n\t"
        "movq   %%mm0, (%3)             \n\t"
        "movq   %%mm1, 8(%3)            \n\t"
        "add    %5, %3                  \n\t"
        "decl   %0                      \n\t"
        "1:                             \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   8(%1), %%mm1            \n\t"
        "add    %4, %1                  \n\t"
        "movq   (%2), %%mm2             \n\t"
        "movq   8(%2), %%mm3            \n\t"
        "pxor   %%mm6, %%mm0            \n\t"
        "pxor   %%mm6, %%mm1            \n\t"
        "pxor   %%mm6, %%mm2            \n\t"
        "pxor   %%mm6, %%mm3            \n\t"
        PAVGB"  %%mm2, %%mm0            \n\t"
        PAVGB"  %%mm3, %%mm1            \n\t"
        "pxor   %%mm6, %%mm0            \n\t"
        "pxor   %%mm6, %%mm1            \n\t"
        "movq   %%mm0, (%3)             \n\t"
        "movq   %%mm1, 8(%3)            \n\t"
        "add    %5, %3                  \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   8(%1), %%mm1            \n\t"
        "add    %4, %1                  \n\t"
        "movq   16(%2), %%mm2           \n\t"
        "movq   24(%2), %%mm3           \n\t"
        "pxor   %%mm6, %%mm0            \n\t"
        "pxor   %%mm6, %%mm1            \n\t"
        "pxor   %%mm6, %%mm2            \n\t"
        "pxor   %%mm6, %%mm3            \n\t"
        PAVGB"  %%mm2, %%mm0            \n\t"
        PAVGB"  %%mm3, %%mm1            \n\t"
        "pxor   %%mm6, %%mm0            \n\t"
        "pxor   %%mm6, %%mm1            \n\t"
        "movq   %%mm0, (%3)             \n\t"
        "movq   %%mm1, 8(%3)            \n\t"
        "add    %5, %3                  \n\t"
        "add    $32, %2                 \n\t"
        "subl   $2, %0                  \n\t"
        "jnz    1b                      \n\t"
#ifdef PIC // Note: "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
// the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}

/* GL: this function does incorrect rounding if overflow */
static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BONE(mm6);
    __asm __volatile(
        "lea    (%3, %3), %%"REG_a"     \n\t"
        "1:                             \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   (%1, %3), %%mm2         \n\t"
        "movq   1(%1), %%mm1            \n\t"
        "movq   1(%1, %3), %%mm3        \n\t"
        "add    %%"REG_a", %1           \n\t"
        "psubusb %%mm6, %%mm0           \n\t"
        "psubusb %%mm6, %%mm2           \n\t"
        PAVGB"  %%mm1, %%mm0            \n\t"
        PAVGB"  %%mm3, %%mm2            \n\t"
        "movq   %%mm0, (%2)             \n\t"
        "movq   %%mm2, (%2, %3)         \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   1(%1), %%mm1            \n\t"
        "movq   (%1, %3), %%mm2         \n\t"
        "movq   1(%1, %3), %%mm3        \n\t"
        "add    %%"REG_a", %2           \n\t"
        "add    %%"REG_a", %1           \n\t"
        "psubusb %%mm6, %%mm0           \n\t"
        "psubusb %%mm6, %%mm2           \n\t"
        PAVGB"  %%mm1, %%mm0            \n\t"
        PAVGB"  %%mm3, %%mm2            \n\t"
        "movq   %%mm0, (%2)             \n\t"
        "movq   %%mm2, (%2, %3)         \n\t"
        "add    %%"REG_a", %2           \n\t"
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}

static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea    (%3, %3), %%"REG_a"     \n\t"
        "movq   (%1), %%mm0             \n\t"
        "sub    %3, %2                  \n\t"
        "1:                             \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq   (%1, %%"REG_a"), %%mm2  \n\t"
        "add    %%"REG_a", %1           \n\t"
        PAVGB"  %%mm1, %%mm0            \n\t"
        PAVGB"  %%mm2, %%mm1            \n\t"
        "movq   %%mm0, (%2, %3)         \n\t"
        "movq   %%mm1, (%2, %%"REG_a")  \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
        "add    %%"REG_a", %2           \n\t"
        "add    %%"REG_a", %1           \n\t"
        PAVGB"  %%mm1, %%mm2            \n\t"
        PAVGB"  %%mm0, %%mm1            \n\t"
        "movq   %%mm2, (%2, %3)         \n\t"
        "movq   %%mm1, (%2, %%"REG_a")  \n\t"
        "add    %%"REG_a", %2           \n\t"
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+g"(h), "+S"(pixels), "+D" (block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 151
diff changeset
612
389
f874493a1970 tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents: 387
diff changeset
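/* For reference, a plain-C sketch of what put_pixels8_y2 computes, under the
 * assumption that PAVGB is the rounding byte average (a + b + 1) >> 1; the
 * _c name below is illustrative only, not part of this file:
 *
 * static void put_pixels8_y2_c(uint8_t *block, const uint8_t *pixels,
 *                              int line_size, int h)
 * {
 *     int i, j;
 *     for (i = 0; i < h; i++) {
 *         // average each pixel with the one directly below it
 *         for (j = 0; j < 8; j++)
 *             block[j] = (pixels[j] + pixels[j + line_size] + 1) >> 1;
 *         block  += line_size;
 *         pixels += line_size;
 *     }
 * }
 */
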
/* GL: this function rounds incorrectly on overflow */
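/* put_no_rnd_pixels8_y2: like put_pixels8_y2 but truncating instead of
 * rounding up. MOVQ_BONE loads 0x01 into every byte of mm6; the saturating
 * "psubusb %%mm6" takes one off a source row first, so PAVGB's round-up
 * cancels: ((a - 1) + b + 1) >> 1 == (a + b) >> 1 whenever a > 0. The
 * saturation at a == 0 is the overflow case the comment above refers to. */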
static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BONE(mm6);
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        "movq (%1), %%mm0               \n\t"
        "sub %3, %2                     \n\t"
        "1:                             \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "add %%"REG_a", %1              \n\t"
        "psubusb %%mm6, %%mm1           \n\t"
        PAVGB" %%mm1, %%mm0             \n\t"
        PAVGB" %%mm2, %%mm1             \n\t"
        "movq %%mm0, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        "psubusb %%mm6, %%mm1           \n\t"
        PAVGB" %%mm1, %%mm2             \n\t"
        PAVGB" %%mm0, %%mm1             \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}

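/* avg_pixels8: rounding average of the source block into the destination
 * (dst = avg(dst, src)), 8 pixels wide, four rows per loop iteration. */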
static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        "1:                             \n\t"
        "movq (%2), %%mm0               \n\t"
        "movq (%2, %3), %%mm1           \n\t"
        PAVGB" (%1), %%mm0              \n\t"
        PAVGB" (%1, %3), %%mm1          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%2), %%mm0               \n\t"
        "movq (%2, %3), %%mm1           \n\t"
        PAVGB" (%1), %%mm0              \n\t"
        PAVGB" (%1, %3), %%mm1          \n\t"
        "add %%"REG_a", %1              \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}

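/* The same operation in plain C, again assuming PAVGB == (a + b + 1) >> 1;
 * the _c name is illustrative only:
 *
 * static void avg_pixels8_c(uint8_t *block, const uint8_t *pixels,
 *                           int line_size, int h)
 * {
 *     int i, j;
 *     for (i = 0; i < h; i++) {
 *         // blend the source row into the destination row
 *         for (j = 0; j < 8; j++)
 *             block[j] = (block[j] + pixels[j] + 1) >> 1;
 *         block  += line_size;
 *         pixels += line_size;
 *     }
 * }
 */

/* avg_pixels8_x2: horizontal half-pel interpolation (average of each source
 * pixel and its right neighbour), then averaged into the destination. */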
static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        PAVGB" 1(%1), %%mm0             \n\t"
        PAVGB" 1(%1, %3), %%mm2         \n\t"
        PAVGB" (%2), %%mm0              \n\t"
        PAVGB" (%2, %3), %%mm2          \n\t"
        "add %%"REG_a", %1              \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        PAVGB" 1(%1), %%mm0             \n\t"
        PAVGB" 1(%1, %3), %%mm2         \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        PAVGB" (%2), %%mm0              \n\t"
        PAVGB" (%2, %3), %%mm2          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}

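/* avg_pixels8_y2: vertical half-pel interpolation averaged into the
 * destination; the destination rows are loaded into mm3/mm4 and blended in
 * before the store. */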
static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        "movq (%1), %%mm0               \n\t"
        "sub %3, %2                     \n\t"
        "1:                             \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "add %%"REG_a", %1              \n\t"
        PAVGB" %%mm1, %%mm0             \n\t"
        PAVGB" %%mm2, %%mm1             \n\t"
        "movq (%2, %3), %%mm3           \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        PAVGB" %%mm3, %%mm0             \n\t"
        PAVGB" %%mm4, %%mm1             \n\t"
        "movq %%mm0, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        PAVGB" %%mm1, %%mm2             \n\t"
        PAVGB" %%mm0, %%mm1             \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        "movq (%2, %3), %%mm3           \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        PAVGB" %%mm3, %%mm2             \n\t"
        PAVGB" %%mm4, %%mm1             \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}

// Note this is not correctly rounded, but this function is only used for B-frames so it doesn't matter
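/* avg_pixels8_xy2: half-pel interpolation in both directions, averaged into
 * the destination; the 2x2 mean is approximated by nested PAVGBs with a
 * single psubusb bias correction, hence the rounding note above. */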
static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BONE(mm6);
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        "movq (%1), %%mm0               \n\t"
        PAVGB" 1(%1), %%mm0             \n\t"
        ".balign 8                      \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "psubusb %%mm6, %%mm2           \n\t"
        PAVGB" 1(%1, %3), %%mm1         \n\t"
        PAVGB" 1(%1, %%"REG_a"), %%mm2  \n\t"
        "add %%"REG_a", %1              \n\t"
        PAVGB" %%mm1, %%mm0             \n\t"
        PAVGB" %%mm2, %%mm1             \n\t"
        PAVGB" (%2), %%mm0              \n\t"
        PAVGB" (%2, %3), %%mm1          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        PAVGB" 1(%1, %3), %%mm1         \n\t"
        PAVGB" 1(%1, %%"REG_a"), %%mm0  \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        PAVGB" %%mm1, %%mm2             \n\t"
        PAVGB" %%mm0, %%mm1             \n\t"
        PAVGB" (%2), %%mm2              \n\t"
        PAVGB" (%2, %3), %%mm1          \n\t"
        "movq %%mm2, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}

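/* The exact value avg_pixels8_xy2 approximates, in plain C (illustrative,
 * not part of this file): for each pixel,
 *
 *     hv       = (p[0] + p[1] + p[line_size] + p[line_size + 1] + 2) >> 2;
 *     block[0] = (block[0] + hv + 1) >> 1;
 *
 * The MMX version chains PAVGBs instead of doing a true 4-tap sum, which is
 * where the rounding error creeps in. */
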
//FIXME the following could be optimized too ...
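// 16 pixel wide variants: each simply runs the corresponding 8 pixel wide
// function on the left and right halves of the block.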
static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
    DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
}
static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put_pixels8_y2)(block , pixels , line_size, h);
    DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
}
static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
    DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
}
static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg_pixels8)(block , pixels , line_size, h);
    DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
}
static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg_pixels8_x2)(block , pixels , line_size, h);
    DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
}
static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg_pixels8_y2)(block , pixels , line_size, h);
    DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
}
static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
    DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
}
