libavcodec.hg: i386/dsputil_mmx

annotate i386/dsputil_mmx_avg.h @ 439:6ae275655a23 libavcodec

* more PIC friendly and faster code

author	kabi
date	Mon, 27 May 2002 14:09:10 +0000
parents	718a22dc121f
children	c0de4d3c7d3c

rev	line source
0 986e461dc072 Initial revision glantau parents: diff changeset	1 /*
986e461dc072 Initial revision glantau parents: diff changeset	2 * DSP utils : average functions are compiled twice for 3dnow/mmx2
429 718a22dc121f license/copyright change glantau parents: 416 diff changeset	3 * Copyright (c) 2000, 2001 Fabrice Bellard.
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	4 * Copyright (c) 2002 Michael Niedermayer
0 986e461dc072 Initial revision glantau parents: diff changeset	5 *
429 718a22dc121f license/copyright change glantau parents: 416 diff changeset	6 * This library is free software; you can redistribute it and/or
718a22dc121f license/copyright change glantau parents: 416 diff changeset	7 * modify it under the terms of the GNU Lesser General Public
718a22dc121f license/copyright change glantau parents: 416 diff changeset	8 * License as published by the Free Software Foundation; either
718a22dc121f license/copyright change glantau parents: 416 diff changeset	9 * version 2 of the License, or (at your option) any later version.
0 986e461dc072 Initial revision glantau parents: diff changeset	10 *
429 718a22dc121f license/copyright change glantau parents: 416 diff changeset	11 * This library is distributed in the hope that it will be useful,
0 986e461dc072 Initial revision glantau parents: diff changeset	12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
429 718a22dc121f license/copyright change glantau parents: 416 diff changeset	13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
718a22dc121f license/copyright change glantau parents: 416 diff changeset	14 * Lesser General Public License for more details.
0 986e461dc072 Initial revision glantau parents: diff changeset	15 *
429 718a22dc121f license/copyright change glantau parents: 416 diff changeset	16 * You should have received a copy of the GNU Lesser General Public
718a22dc121f license/copyright change glantau parents: 416 diff changeset	17 * License along with this library; if not, write to the Free Software
718a22dc121f license/copyright change glantau parents: 416 diff changeset	18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0 986e461dc072 Initial revision glantau parents: diff changeset	19 *
986e461dc072 Initial revision glantau parents: diff changeset	20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
0 986e461dc072 Initial revision glantau parents: diff changeset	22 */
387 b8f3affeb8e1 shared lib support (req by kabi) ... michaelni parents: 386 diff changeset	23
389 f874493a1970 tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests) glantau parents: 387 diff changeset	24 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	25 clobber bug - now it will work with 2.95.2 and also with -fPIC
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	26 */
/*
 * put_pixels_x2: store the byte-wise PAVGB of each source pixel and its
 * right-hand neighbour (pixels[x] and pixels[x+1]) into block, 8 bytes per
 * row, 4 rows per loop iteration.
 *
 * Asm operands: %0 = h (row counter, read/write), %1 = pixels in ESI,
 * %2 = block in EDI, %3 = line_size in ECX.  EAX is used as scratch and
 * holds 2*line_size for the whole loop.
 *
 * h must be a positive multiple of 4: the loop subtracts 4 per iteration
 * and only exits when the counter reaches exactly zero.
 *
 * PAVGB and DEF are macros defined outside this view; the file header says
 * these functions are compiled twice for 3dnow/mmx2, so PAVGB presumably
 * selects the matching byte-average instruction -- TODO confirm.
 *
 * NOTE(review): block and pixels are dereferenced as addresses by the asm,
 * yet the signature in this listing shows no '*' declarators; they look
 * lost in extraction -- confirm against the upstream file.
 * NOTE(review): mm0-mm3 are written but do not appear in the clobber list.
 */
0 986e461dc072 Initial revision glantau parents: diff changeset	27 static void DEF(put_pixels_x2)(UINT8 block, const UINT8 pixels, int line_size, int h)
986e461dc072 Initial revision glantau parents: diff changeset	28 {
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	29 __asm __volatile(
/* eax = 2 * line_size: the stride used to step over a pair of rows */
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	30 "lea (%3, %3), %%eax \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	31 "1: \n\t"
/* rows 0 and 1: load each row at offsets 0 and 1, then advance the source */
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	32 "movq (%1), %%mm0 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	33 "movq (%1, %3), %%mm2 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	34 "movq 1(%1), %%mm1 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	35 "movq 1(%1, %3), %%mm3 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	36 "addl %%eax, %1 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	37 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	38 PAVGB" %%mm3, %%mm2 \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	39 "movq %%mm0, (%2) \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	40 "movq %%mm2, (%2, %3) \n\t"
/* rows 2 and 3: same pattern; both pointers are advanced before the stores */
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	41 "movq (%1), %%mm0 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	42 "movq 1(%1), %%mm1 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	43 "movq (%1, %3), %%mm2 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	44 "movq 1(%1, %3), %%mm3 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	45 "addl %%eax, %2 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	46 "addl %%eax, %1 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	47 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	48 PAVGB" %%mm3, %%mm2 \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	49 "movq %%mm0, (%2) \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	50 "movq %%mm2, (%2, %3) \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	51 "addl %%eax, %2 \n\t"
/* four rows done; loop until the row counter hits zero */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	52 "subl $4, %0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	53 " jnz 1b \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	54 :"+g"(h), "+S"(pixels), "+D"(block)
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	55 :"c" (line_size)
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	56 :"%eax", "memory");
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	57 }
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	58
389 f874493a1970 tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests) glantau parents: 387 diff changeset	59 /* GL: this function does incorrect rounding if overflow */
/*
 * put_no_rnd_pixels_x2: same data flow as the rounding x2 variant -- each
 * output byte combines pixels[x] and pixels[x+1], 8 bytes per row, 4 rows
 * per loop iteration -- except that mm7 is subtracted (psubusb, unsigned
 * saturating) from the first operand of every pair before PAVGB.
 *
 * Asm operands: %0 = h (row counter, read/write), %1 = pixels in ESI,
 * %2 = block in EDI, %3 = line_size in ECX.  EAX holds 2*line_size.
 *
 * h must be a positive multiple of 4: the loop subtracts 4 per iteration
 * and only exits when the counter reaches exactly zero.
 *
 * MOVQ_BONE, PAVGB and DEF are macros defined outside this view.
 * MOVQ_BONE(%%mm7) presumably fills every byte of mm7 with 1 so that the
 * subtract cancels PAVGB's round-up -- TODO confirm.  Because psubusb
 * saturates, a source byte of 0 cannot be decremented, which is presumably
 * the rounding inaccuracy this function is known for.
 *
 * NOTE(review): block and pixels are dereferenced as addresses by the asm,
 * yet the signature in this listing shows no '*' declarators; they look
 * lost in extraction -- confirm against the upstream file.
 * NOTE(review): mm0-mm3 and mm7 are written but are not listed as clobbers.
 */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	60 static void DEF(put_no_rnd_pixels_x2)(UINT8 block, const UINT8 pixels, int line_size, int h)
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	61 {
0 986e461dc072 Initial revision glantau parents: diff changeset	62 __asm __volatile(
/* eax = 2 * line_size: the stride used to step over a pair of rows */
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	63 "lea (%3, %3), %%eax \n\t"
387 b8f3affeb8e1 shared lib support (req by kabi) ... michaelni parents: 386 diff changeset	64 MOVQ_BONE(%%mm7)
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	65 "1: \n\t"
/* rows 0 and 1 */
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	66 "movq (%1), %%mm0 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	67 "movq (%1, %3), %%mm2 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	68 "movq 1(%1), %%mm1 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	69 "movq 1(%1, %3), %%mm3 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	70 "addl %%eax, %1 \n\t"
/* bias the left-hand pixels down by mm7 before averaging */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	71 "psubusb %%mm7, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	72 "psubusb %%mm7, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	73 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	74 PAVGB" %%mm3, %%mm2 \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	75 "movq %%mm0, (%2) \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	76 "movq %%mm2, (%2, %3) \n\t"
/* rows 2 and 3 */
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	77 "movq (%1), %%mm0 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	78 "movq 1(%1), %%mm1 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	79 "movq (%1, %3), %%mm2 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	80 "movq 1(%1, %3), %%mm3 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	81 "addl %%eax, %2 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	82 "addl %%eax, %1 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	83 "psubusb %%mm7, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	84 "psubusb %%mm7, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	85 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	86 PAVGB" %%mm3, %%mm2 \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	87 "movq %%mm0, (%2) \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	88 "movq %%mm2, (%2, %3) \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	89 "addl %%eax, %2 \n\t"
/* four rows done; loop until the row counter hits zero */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	90 "subl $4, %0 \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	91 "jnz 1b \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	92 :"+g"(h), "+S"(pixels), "+D"(block)
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	93 :"c" (line_size)
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	94 :"%eax", "memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	95 }
986e461dc072 Initial revision glantau parents: diff changeset	96
/*
 * put_pixels_y2: store the byte-wise PAVGB of each source row and the row
 * below it into block, 8 bytes per row, 4 output rows per loop iteration.
 *
 * Asm operands: %0 = h (row counter, read/write), %1 = pixels in ESI,
 * %2 = block in EDI, %3 = line_size in ECX.  EAX holds 2*line_size and is
 * used both as a stride and as an index register.
 *
 * Register rotation: the last source row loaded in one step is kept in an
 * MMX register (mm0 / mm2) and reused as the upper row of the next step,
 * so every source row is loaded once.  Producing h output rows reads h+1
 * source rows.
 *
 * h must be a positive multiple of 4: the loop subtracts 4 per iteration
 * and only exits when the counter reaches exactly zero.
 *
 * PAVGB and DEF are macros defined outside this view (PAVGB presumably the
 * mmx2/3dnow byte-average instruction -- TODO confirm).
 *
 * NOTE(review): block and pixels are dereferenced as addresses by the asm,
 * yet the signature in this listing shows no '*' declarators; they look
 * lost in extraction -- confirm against the upstream file.
 * NOTE(review): mm0-mm2 are written but are not listed as clobbers.
 */
986e461dc072 Initial revision glantau parents: diff changeset	97 static void DEF(put_pixels_y2)(UINT8 block, const UINT8 pixels, int line_size, int h)
986e461dc072 Initial revision glantau parents: diff changeset	98 {
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	99 __asm __volatile(
/* eax = 2 * line_size; mm0 = first source row */
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	100 "lea (%3, %3), %%eax \n\t"
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	101 "movq (%1), %%mm0 \n\t"
/* pre-bias block by one row so stores at +line_size and +2*line_size hit output rows 0 and 1 */
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	102 "subl %3, %2 \n\t"
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	103 "1: \n\t"
/* load the next two source rows, then average (prev,row1) and (row1,row2) */
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	104 "movq (%1, %3), %%mm1 \n\t"
416 ca1f2c0e44ef * fixed contrains and avoid usage of scale index access kabi parents: 414 diff changeset	105 "movq (%1, %%eax), %%mm2 \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	106 "addl %%eax, %1 \n\t"
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	107 PAVGB" %%mm1, %%mm0 \n\t"
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	108 PAVGB" %%mm2, %%mm1 \n\t"
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	109 "movq %%mm0, (%2, %3) \n\t"
416 ca1f2c0e44ef * fixed contrains and avoid usage of scale index access kabi parents: 414 diff changeset	110 "movq %%mm1, (%2, %%eax) \n\t"
/* second half: mm2 still holds the previous row; mm0 receives the row carried into the next iteration */
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	111 "movq (%1, %3), %%mm1 \n\t"
416 ca1f2c0e44ef * fixed contrains and avoid usage of scale index access kabi parents: 414 diff changeset	112 "movq (%1, %%eax), %%mm0 \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	113 "addl %%eax, %2 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	114 "addl %%eax, %1 \n\t"
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	115 PAVGB" %%mm1, %%mm2 \n\t"
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	116 PAVGB" %%mm0, %%mm1 \n\t"
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	117 "movq %%mm2, (%2, %3) \n\t"
416 ca1f2c0e44ef * fixed contrains and avoid usage of scale index access kabi parents: 414 diff changeset	118 "movq %%mm1, (%2, %%eax) \n\t"
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	119 "addl %%eax, %2 \n\t"
/* four rows done; loop until the row counter hits zero */
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	120 "subl $4, %0 \n\t"
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	121 "jnz 1b \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	122 :"+g"(h), "+S"(pixels), "+D" (block)
416 ca1f2c0e44ef * fixed contrains and avoid usage of scale index access kabi parents: 414 diff changeset	123 :"c"(line_size)
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	124 :"%eax", "memory");
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	125 }
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	126
389 f874493a1970 tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests) glantau parents: 387 diff changeset	127 /* GL: this function does incorrect rounding if overflow */
/*
 * put_no_rnd_pixels_y2: same data flow as the rounding y2 variant -- each
 * output row combines a source row with the row below it, 8 bytes per row,
 * 4 output rows per loop iteration -- except that mm7 is subtracted
 * (psubusb, unsigned saturating) from the shared middle row (mm1) before
 * it takes part in the two PAVGB operations of each half.
 *
 * Asm operands: %0 = h (row counter, read/write), %1 = pixels in ESI,
 * %2 = block in EDI, %3 = line_size in ECX.  EAX holds 2*line_size and is
 * used both as a stride and as an index register.
 *
 * h must be a positive multiple of 4: the loop subtracts 4 per iteration
 * and only exits when the counter reaches exactly zero.  Producing h output
 * rows reads h+1 source rows.
 *
 * MOVQ_BONE, PAVGB and DEF are macros defined outside this view.
 * MOVQ_BONE(%%mm7) presumably fills every byte of mm7 with 1 so that the
 * subtract cancels PAVGB's round-up -- TODO confirm.  Because psubusb
 * saturates, a byte of 0 cannot be decremented, which is presumably the
 * rounding inaccuracy this function is known for.
 *
 * NOTE(review): block and pixels are dereferenced as addresses by the asm,
 * yet the signature in this listing shows no '*' declarators; they look
 * lost in extraction -- confirm against the upstream file.
 * NOTE(review): mm0-mm2 and mm7 are written but are not listed as clobbers.
 */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	128 static void DEF(put_no_rnd_pixels_y2)(UINT8 block, const UINT8 pixels, int line_size, int h)
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	129 {
0 986e461dc072 Initial revision glantau parents: diff changeset	130 __asm __volatile(
387 b8f3affeb8e1 shared lib support (req by kabi) ... michaelni parents: 386 diff changeset	131 MOVQ_BONE(%%mm7)
/* eax = 2 * line_size; mm0 = first source row */
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	132 "lea (%3, %3), %%eax \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	133 "movq (%1), %%mm0 \n\t"
/* pre-bias block by one row so stores at +line_size and +2*line_size hit output rows 0 and 1 */
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	134 "subl %3, %2 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	135 "1: \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	136 "movq (%1, %3), %%mm1 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	137 "movq (%1, %%eax), %%mm2 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	138 "addl %%eax, %1 \n\t"
/* bias the middle row down by mm7; it feeds both averages below */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	139 "psubusb %%mm7, %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	140 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	141 PAVGB" %%mm2, %%mm1 \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	142 "movq %%mm0, (%2, %3) \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	143 "movq %%mm1, (%2, %%eax) \n\t"
/* second half: mm2 still holds the previous row; mm0 receives the row carried into the next iteration */
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	144 "movq (%1, %3), %%mm1 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	145 "movq (%1, %%eax), %%mm0 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	146 "addl %%eax, %2 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	147 "addl %%eax, %1 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	148 "psubusb %%mm7, %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	149 PAVGB" %%mm1, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	150 PAVGB" %%mm0, %%mm1 \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	151 "movq %%mm2, (%2, %3) \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	152 "movq %%mm1, (%2, %%eax) \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	153 "addl %%eax, %2 \n\t"
/* four rows done; loop until the row counter hits zero */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	154 "subl $4, %0 \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	155 "jnz 1b \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	156 :"+g"(h), "+S"(pixels), "+D" (block)
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	157 :"c"(line_size)
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	158 :"%eax", "memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	159 }
986e461dc072 Initial revision glantau parents: diff changeset	160
/*
 * avg_pixels: combine the source rows with what is already stored in block
 * using PAVGB and write the result back to block, 8 bytes per row, 4 rows
 * per loop iteration.
 *
 * Asm operands: %0 = h (row counter, read/write), %1 = pixels in EDI,
 * %2 = pixels + line_size in ESI, %3 = block, %4 = block + line_size,
 * %5 = 2 * line_size.  The four base addresses stay constant; EAX is a
 * running byte offset that starts at 0 and grows by %5 after each pair of
 * rows.
 *
 * h must be a positive multiple of 4: the loop subtracts 4 per iteration
 * and only exits when the counter reaches exactly zero.
 *
 * PAVGB and DEF are macros defined outside this view (PAVGB presumably the
 * mmx2/3dnow byte-average instruction -- TODO confirm).
 *
 * NOTE(review): block and pixels are dereferenced as addresses by the asm,
 * yet the signature in this listing shows no '*' declarators; they look
 * lost in extraction -- confirm against the upstream file.
 * NOTE(review): mm0 and mm2-mm4 are written but are not listed as clobbers.
 */
986e461dc072 Initial revision glantau parents: diff changeset	161 static void DEF(avg_pixels)(UINT8 block, const UINT8 pixels, int line_size, int h)
986e461dc072 Initial revision glantau parents: diff changeset	162 {
986e461dc072 Initial revision glantau parents: diff changeset	163 __asm __volatile(
/* eax = 0: byte offset shared by all four row pointers */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	164 "xorl %%eax, %%eax \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	165 ".balign 16 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	166 "1: \n\t"
/* rows 0 and 1: mm0/mm2 = source rows, mm3/mm4 = current destination rows */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	167 "movq (%1, %%eax), %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	168 "movq (%2, %%eax), %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	169 "movq (%3, %%eax), %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	170 "movq (%4, %%eax), %%mm4 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	171 PAVGB" %%mm3, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	172 PAVGB" %%mm4, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	173 "movq %%mm0, (%3, %%eax) \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	174 "movq %%mm2, (%4, %%eax) \n\t"
/* step the offset by two rows and repeat for rows 2 and 3 */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	175 "addl %5, %%eax \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	176 "movq (%1, %%eax), %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	177 "movq (%2, %%eax), %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	178 "movq (%3, %%eax), %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	179 "movq (%4, %%eax), %%mm4 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	180 PAVGB" %%mm3, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	181 PAVGB" %%mm4, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	182 "movq %%mm0, (%3, %%eax) \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	183 "movq %%mm2, (%4, %%eax) \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	184 "addl %5, %%eax \n\t"
/* four rows done; loop until the row counter hits zero */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	185 "subl $4, %0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	186 " jnz 1b \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	187 :"+g"(h)
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	188 :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size),
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	189 "g"(line_size<<1)
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	190 :"%eax", "memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	191 }
986e461dc072 Initial revision glantau parents: diff changeset	192
/*
 * avg_pixels_x2: for each row, first PAVGB the source bytes with their
 * right-hand neighbours (offsets 0 and 1), then PAVGB that result with
 * what is already stored in block and write it back.  8 bytes per row,
 * 4 rows per loop iteration.
 *
 * Asm operands: %0 = h (row counter, read/write), %1 = pixels in EDI,
 * %2 = pixels + line_size in ESI, %3 = block, %4 = block + line_size,
 * %5 = 2 * line_size.  The four base addresses stay constant; EAX is a
 * running byte offset that starts at 0 and grows by %5 after each pair of
 * rows.
 *
 * h must be a positive multiple of 4: the loop subtracts 4 per iteration
 * and only exits when the counter reaches exactly zero.
 *
 * PAVGB and DEF are macros defined outside this view (PAVGB presumably the
 * mmx2/3dnow byte-average instruction -- TODO confirm).
 *
 * NOTE(review): block and pixels are dereferenced as addresses by the asm,
 * yet the signature in this listing shows no '*' declarators; they look
 * lost in extraction -- confirm against the upstream file.
 * NOTE(review): mm0-mm4 are written but are not listed as clobbers.
 */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	193 static void DEF(avg_pixels_x2)(UINT8 block, const UINT8 pixels, int line_size, int h)
0 986e461dc072 Initial revision glantau parents: diff changeset	194 {
986e461dc072 Initial revision glantau parents: diff changeset	195 __asm __volatile(
/* eax = 0: byte offset shared by all four row pointers */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	196 "xorl %%eax, %%eax \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	197 ".balign 16 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	198 "1: \n\t"
/* rows 0 and 1: horizontal average of the source first */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	199 "movq (%1, %%eax), %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	200 "movq 1(%1, %%eax), %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	201 "movq (%2, %%eax), %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	202 "movq 1(%2, %%eax), %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	203 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	204 PAVGB" %%mm3, %%mm2 \n\t"
/* then average with the current destination rows and store back */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	205 "movq (%3, %%eax), %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	206 "movq (%4, %%eax), %%mm4 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	207 PAVGB" %%mm3, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	208 PAVGB" %%mm4, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	209 "movq %%mm0, (%3, %%eax) \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	210 "movq %%mm2, (%4, %%eax) \n\t"
/* step the offset by two rows and repeat for rows 2 and 3 */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	211 "addl %5, %%eax \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	212 "movq (%1, %%eax), %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	213 "movq 1(%1, %%eax), %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	214 "movq (%2, %%eax), %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	215 "movq 1(%2, %%eax), %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	216 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	217 PAVGB" %%mm3, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	218 "movq (%3, %%eax), %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	219 "movq (%4, %%eax), %%mm4 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	220 PAVGB" %%mm3, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	221 PAVGB" %%mm4, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	222 "movq %%mm0, (%3, %%eax) \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	223 "movq %%mm2, (%4, %%eax) \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	224 "addl %5, %%eax \n\t"
/* four rows done; loop until the row counter hits zero */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	225 "subl $4, %0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	226 " jnz 1b \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	227 :"+g"(h)
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	228 :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size),
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	229 "g"(line_size<<1)
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	230 :"%eax", "memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	231 }
986e461dc072 Initial revision glantau parents: diff changeset	232
/*
 * avg_pixels_y2: for each output row, first PAVGB a source row with the
 * row below it, then PAVGB that result with what is already stored in
 * block and write it back.  8 bytes per row, 4 rows per loop iteration.
 *
 * Asm operands: %0 = h (row counter, read/write), %1 = pixels in EDI,
 * %2 = pixels + line_size in ESI, %3 = pixels + 2*line_size, %4 = block,
 * %5 = block + line_size, %6 = 2 * line_size.  The base addresses stay
 * constant; EAX is a running byte offset that starts at 0 and grows by %6
 * after each pair of rows.  %1 is only used once, before the loop, to load
 * the very first source row.
 *
 * Register rotation: the last source row loaded in one step is kept in an
 * MMX register (mm0 / mm2) and reused as the upper row of the next step.
 * Producing h output rows reads h+1 source rows.
 *
 * h must be a positive multiple of 4: the loop subtracts 4 per iteration
 * and only exits when the counter reaches exactly zero.
 *
 * PAVGB and DEF are macros defined outside this view (PAVGB presumably the
 * mmx2/3dnow byte-average instruction -- TODO confirm).
 *
 * NOTE(review): block and pixels are dereferenced as addresses by the asm,
 * yet the signature in this listing shows no '*' declarators; they look
 * lost in extraction -- confirm against the upstream file.
 * NOTE(review): mm0-mm4 are written but are not listed as clobbers.
 */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	233 static void DEF(avg_pixels_y2)(UINT8 block, const UINT8 pixels, int line_size, int h)
0 986e461dc072 Initial revision glantau parents: diff changeset	234 {
986e461dc072 Initial revision glantau parents: diff changeset	235 __asm __volatile(
/* eax = 0 (running offset); mm0 = first source row */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	236 "xorl %%eax, %%eax \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	237 "movq (%1), %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	238 ".balign 16 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	239 "1: \n\t"
/* vertical averages: (prev,row1) into mm0 and (row1,row2) into mm1 */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	240 "movq (%2, %%eax), %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	241 "movq (%3, %%eax), %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	242 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	243 PAVGB" %%mm2, %%mm1 \n\t"
/* then average with the current destination rows and store back */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	244 "movq (%4, %%eax), %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	245 "movq (%5, %%eax), %%mm4 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	246 PAVGB" %%mm3, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	247 PAVGB" %%mm4, %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	248 "movq %%mm0, (%4, %%eax) \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	249 "movq %%mm1, (%5, %%eax) \n\t"
/* second half: mm2 still holds the previous row; mm0 receives the row carried into the next iteration */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	250 "addl %6, %%eax \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	251 "movq (%2, %%eax), %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	252 "movq (%3, %%eax), %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	253 PAVGB" %%mm1, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	254 PAVGB" %%mm0, %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	255 "movq (%4, %%eax), %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	256 "movq (%5, %%eax), %%mm4 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	257 PAVGB" %%mm3, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	258 PAVGB" %%mm4, %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	259 "movq %%mm2, (%4, %%eax) \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	260 "movq %%mm1, (%5, %%eax) \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	261 "addl %6, %%eax \n\t"
/* four rows done; loop until the row counter hits zero */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	262 "subl $4, %0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	263 " jnz 1b \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	264 :"+g"(h)
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	265 :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	266 "r" (block+line_size), "g"(line_size<<1)
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	267 :"%eax", "memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	268 }
986e461dc072 Initial revision glantau parents: diff changeset	269
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	270 // Note this is not correctly rounded, but this function is only used for b frames so it doesn't matter
/*
 * avg_pixels_xy2: for each 8-byte row of block, combine source row data from
 * pixels (each row merged with itself shifted by one byte, then merged with
 * the row below) and merge that result with the bytes already in block,
 * writing the outcome back to block.
 *
 * Every merge step is a PAVGB; PAVGB and MOVQ_BONE are macros defined outside
 * this view. Presumably PAVGB is the packed unsigned byte average of the
 * instruction set being compiled (the file header says this code is compiled
 * twice, for 3dnow and mmx2), and MOVQ_BONE fills the given register with
 * byte-wise ones -- TODO confirm against the macro definitions.
 *
 * Asm operands:
 *   %0 = h (read/write row counter)
 *   %1 = pixels               %2 = pixels+line_size   %3 = pixels+line_size*2
 *   %4 = block                %5 = block+line_size
 *   %6 = line_size<<1 (offset step of two rows)
 *
 * Each loop pass stores four rows and subtracts 4 from h; the loop exits only
 * when h reaches exactly zero, so h is expected to be a positive multiple
 * of 4. Source rows 0..h are read, 8 bytes at byte offsets 0 and 1 of each
 * row. eax is used as scratch and declared as clobbered together with
 * "memory".
 *
 * NOTE(review): block and pixels are used as addresses in the asm operands;
 * the pointer declarators seem to be missing from the signature in this
 * listing -- verify against the repository copy.
 */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	271 static void DEF(avg_pixels_xy2)(UINT8 block, const UINT8 pixels, int line_size, int h)
0 986e461dc072 Initial revision glantau parents: diff changeset	272 {
986e461dc072 Initial revision glantau parents: diff changeset	273 __asm __volatile(
387 b8f3affeb8e1 shared lib support (req by kabi) ... michaelni parents: 386 diff changeset	274 MOVQ_BONE(%%mm7)
/* eax = running byte offset into both source and destination, starts at 0 */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	275 "xorl %%eax, %%eax \n\t"
/* prologue: mm0 = source row 0 merged with itself shifted by one byte */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	276 "movq (%1), %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	277 "movq 1(%1), %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	278 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	279 ".balign 16 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	280 "1: \n\t"
/* first half-pass: load the rows at %2 and %3 (offset 0 and offset 1) */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	281 "movq (%2, %%eax), %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	282 "movq (%3, %%eax), %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	283 "movq 1(%2, %%eax), %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	284 "movq 1(%3, %%eax), %%mm4 \n\t"
/* saturating subtract of mm7 from the %3 row only; presumably offsets the
   upward bias of the repeated averaging -- TODO confirm */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	285 "psubusb %%mm7, %%mm2 \n\t"
/* merge each row with its shifted copy, then with the row above:
   mm0 = (carried row, %2 row), mm1 = (%2 row, %3 row); mm2 keeps the %3 row */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	286 PAVGB" %%mm3, %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	287 PAVGB" %%mm4, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	288 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	289 PAVGB" %%mm2, %%mm1 \n\t"
/* merge with the existing destination rows and write them back */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	290 "movq (%4, %%eax), %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	291 "movq (%5, %%eax), %%mm4 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	292 PAVGB" %%mm3, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	293 PAVGB" %%mm4, %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	294 "movq %%mm0, (%4, %%eax) \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	295 "movq %%mm1, (%5, %%eax) \n\t"
/* advance two rows */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	296 "addl %6, %%eax \n\t"
/* second half-pass: same pattern with register roles rotated; mm2 still
   holds the merged %3 row from the first half, and no psubusb is applied */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	297 "movq (%2, %%eax), %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	298 "movq (%3, %%eax), %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	299 "movq 1(%2, %%eax), %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	300 "movq 1(%3, %%eax), %%mm4 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	301 PAVGB" %%mm3, %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	302 PAVGB" %%mm4, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	303 PAVGB" %%mm1, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	304 PAVGB" %%mm0, %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	305 "movq (%4, %%eax), %%mm3 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	306 "movq (%5, %%eax), %%mm4 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	307 PAVGB" %%mm3, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	308 PAVGB" %%mm4, %%mm1 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	309 "movq %%mm2, (%4, %%eax) \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	310 "movq %%mm1, (%5, %%eax) \n\t"
/* advance two more rows; mm0 carries the last merged source row into the
   next pass. Four rows are done, so h -= 4 and loop while h != 0 */
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	311 "addl %6, %%eax \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	312 "subl $4, %0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	313 " jnz 1b \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	314 :"+g"(h)
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	315 :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	316 "r" (block+line_size), "g"(line_size<<1)
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	317 :"%eax", "memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	318 }

Mercurial > libavcodec.hg

annotate i386/dsputil_mmx_avg.h @ 439:6ae275655a23 libavcodec