Mercurial > libavcodec.hg
annotate i386/dsputil_mmx_avg.h @ 2367:c353719836af libavcodec
fix some type mismatches patch by (Jeff Muizelaar <muizelaar rogers com>)
author | michael |
---|---|
date | Thu, 25 Nov 2004 19:17:27 +0000 |
parents | 15cfba1b97b5 |
children | 00f608ae3791 |
rev | line source |
---|---|
0 | 1 /* |
2 * DSP utils : average functions are compiled twice for 3dnow/mmx2 | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
1739
07a484280a82
copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents:
1064
diff
changeset
|
4 * Copyright (c) 2002-2004 Michael Niedermayer |
0 | 5 * |
429 | 6 * This library is free software; you can redistribute it and/or |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2 of the License, or (at your option) any later version. | |
0 | 10 * |
429 | 11 * This library is distributed in the hope that it will be useful, |
0 | 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 * Lesser General Public License for more details. | |
0 | 15 * |
429 | 16 * You should have received a copy of the GNU Lesser General Public |
17 * License along with this library; if not, write to the Free Software | |
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
0 | 19 * |
20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
386 | 21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> |
441 | 22 * and improved by Zdenek Kabelac <kabi@users.sf.net> |
0 | 23 */ |
387 | 24 |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
25 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
26 clobber bug - now it will work with 2.95.2 and also with -fPIC |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
27 */ |
1064 | 28 static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
0 | 29 { |
386 | 30 __asm __volatile( |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
31 "lea (%3, %3), %%"REG_a" \n\t" |
441 | 32 "1: \n\t" |
33 "movq (%1), %%mm0 \n\t" | |
34 "movq (%1, %3), %%mm1 \n\t" | |
35 PAVGB" 1(%1), %%mm0 \n\t" | |
36 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
37 "movq %%mm0, (%2) \n\t" | |
38 "movq %%mm1, (%2, %3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
39 "add %%"REG_a", %1 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
40 "add %%"REG_a", %2 \n\t" |
441 | 41 "movq (%1), %%mm0 \n\t" |
42 "movq (%1, %3), %%mm1 \n\t" | |
43 PAVGB" 1(%1), %%mm0 \n\t" | |
44 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
45 "add %%"REG_a", %1 \n\t" |
441 | 46 "movq %%mm0, (%2) \n\t" |
47 "movq %%mm1, (%2, %3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
48 "add %%"REG_a", %2 \n\t" |
441 | 49 "subl $4, %0 \n\t" |
50 "jnz 1b \n\t" | |
51 :"+g"(h), "+S"(pixels), "+D"(block) | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
52 :"r" ((long)line_size) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
53 :"%"REG_a, "memory"); |
441 | 54 } |
651 | 55 |
2209 | 56 static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
57 { | |
58 __asm __volatile( | |
59 "testl $1, %0 \n\t" | |
60 " jz 1f \n\t" | |
61 "movd (%1), %%mm0 \n\t" | |
62 "movd (%2), %%mm1 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
63 "add %4, %1 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
64 "add $4, %2 \n\t" |
2209 | 65 PAVGB" %%mm1, %%mm0 \n\t" |
66 "movd %%mm0, (%3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
67 "add %5, %3 \n\t" |
2209 | 68 "decl %0 \n\t" |
69 "1: \n\t" | |
70 "movd (%1), %%mm0 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
71 "add %4, %1 \n\t" |
2209 | 72 "movd (%1), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
73 "add %4, %1 \n\t" |
2209 | 74 PAVGB" (%2), %%mm0 \n\t" |
75 PAVGB" 4(%2), %%mm1 \n\t" | |
76 "movd %%mm0, (%3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
77 "add %5, %3 \n\t" |
2209 | 78 "movd %%mm1, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
79 "add %5, %3 \n\t" |
2209 | 80 "movd (%1), %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
81 "add %4, %1 \n\t" |
2209 | 82 "movd (%1), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
83 "add %4, %1 \n\t" |
2209 | 84 PAVGB" 8(%2), %%mm0 \n\t" |
85 PAVGB" 12(%2), %%mm1 \n\t" | |
86 "movd %%mm0, (%3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
87 "add %5, %3 \n\t" |
2209 | 88 "movd %%mm1, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
89 "add %5, %3 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
90 "add $16, %2 \n\t" |
2209 | 91 "subl $4, %0 \n\t" |
92 "jnz 1b \n\t" | |
93 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used | |
94 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
95 #else | |
96 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
97 #endif | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
98 :"S"((long)src1Stride), "D"((long)dstStride) |
2209 | 99 :"memory"); |
100 } | |
101 | |
102 | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
103 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
954 | 104 { |
105 __asm __volatile( | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
106 "testl $1, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
107 " jz 1f \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
108 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
109 "movq (%2), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
110 "add %4, %1 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
111 "add $8, %2 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
112 PAVGB" %%mm1, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
113 "movq %%mm0, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
114 "add %5, %3 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
115 "decl %0 \n\t" |
954 | 116 "1: \n\t" |
117 "movq (%1), %%mm0 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
118 "add %4, %1 \n\t" |
954 | 119 "movq (%1), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
120 "add %4, %1 \n\t" |
954 | 121 PAVGB" (%2), %%mm0 \n\t" |
122 PAVGB" 8(%2), %%mm1 \n\t" | |
123 "movq %%mm0, (%3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
124 "add %5, %3 \n\t" |
954 | 125 "movq %%mm1, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
126 "add %5, %3 \n\t" |
954 | 127 "movq (%1), %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
128 "add %4, %1 \n\t" |
954 | 129 "movq (%1), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
130 "add %4, %1 \n\t" |
954 | 131 PAVGB" 16(%2), %%mm0 \n\t" |
132 PAVGB" 24(%2), %%mm1 \n\t" | |
133 "movq %%mm0, (%3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
134 "add %5, %3 \n\t" |
954 | 135 "movq %%mm1, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
136 "add %5, %3 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
137 "add $32, %2 \n\t" |
954 | 138 "subl $4, %0 \n\t" |
139 "jnz 1b \n\t" | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
140 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
141 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
142 #else |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
143 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
144 #endif |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
145 :"S"((long)src1Stride), "D"((long)dstStride) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
146 :"memory"); |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
147 //the following should be used, though better not with gcc ... |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
148 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
954 | 149 :"r"(src1Stride), "r"(dstStride) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
150 :"memory");*/ |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
151 } |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
152 |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
153 static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
154 { |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
155 __asm __volatile( |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
156 "pcmpeqb %%mm6, %%mm6 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
157 "testl $1, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
158 " jz 1f \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
159 "movq (%1), %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
160 "movq (%2), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
161 "add %4, %1 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
162 "add $8, %2 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
163 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
164 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
165 PAVGB" %%mm1, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
166 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
167 "movq %%mm0, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
168 "add %5, %3 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
169 "decl %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
170 "1: \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
171 "movq (%1), %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
172 "add %4, %1 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
173 "movq (%1), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
174 "add %4, %1 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
175 "movq (%2), %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
176 "movq 8(%2), %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
177 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
178 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
179 "pxor %%mm6, %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
180 "pxor %%mm6, %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
181 PAVGB" %%mm2, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
182 PAVGB" %%mm3, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
183 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
184 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
185 "movq %%mm0, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
186 "add %5, %3 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
187 "movq %%mm1, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
188 "add %5, %3 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
189 "movq (%1), %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
190 "add %4, %1 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
191 "movq (%1), %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
192 "add %4, %1 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
193 "movq 16(%2), %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
194 "movq 24(%2), %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
195 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
196 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
197 "pxor %%mm6, %%mm2 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
198 "pxor %%mm6, %%mm3 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
199 PAVGB" %%mm2, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
200 PAVGB" %%mm3, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
201 "pxor %%mm6, %%mm0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
202 "pxor %%mm6, %%mm1 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
203 "movq %%mm0, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
204 "add %5, %3 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
205 "movq %%mm1, (%3) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
206 "add %5, %3 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
207 "add $32, %2 \n\t" |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
208 "subl $4, %0 \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
209 "jnz 1b \n\t" |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
210 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
211 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
212 #else |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
213 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
214 #endif |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2209
diff
changeset
|
215 :"S"((long)src1Stride), "D"((long)dstStride) |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
216 :"memory"); |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
217 //the following should be used, though better not with gcc ... |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
218 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
219 :"r"(src1Stride), "r"(dstStride) |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
220 :"memory");*/ |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
221 } |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
222 |
/**
 * Average two 4-pixel-wide sources and blend the result into dst.
 *
 * For each of the h rows: dst = PAVGB(PAVGB(src1, src2), dst), where
 * PAVGB expands to pavgb (MMX2) or pavgusb (3DNow!) depending on which
 * variant this file is being compiled as (see file header).
 *
 * src1 advances by src1Stride per row; src2 is a packed buffer that
 * advances by 4 bytes per row; dst advances by dstStride.
 *
 * The loop is unrolled 4 rows per iteration; an odd h is handled by the
 * prologue before label 1. NOTE(review): h values where (h & ~1) is not
 * a multiple of 4 would not terminate cleanly — presumably callers only
 * pass compatible heights; verify against call sites.
 */
static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        /* if h is odd, process one row first so the loop count is even */
        "testl $1, %0               \n\t"
        " jz 1f                     \n\t"
        "movd (%1), %%mm0           \n\t"
        "movd (%2), %%mm1           \n\t"
        "add %4, %1                 \n\t"
        "add $4, %2                 \n\t"
        PAVGB" %%mm1, %%mm0         \n\t"
        PAVGB" (%3), %%mm0          \n\t" /* blend with existing dst (avg, not put) */
        "movd %%mm0, (%3)           \n\t"
        "add %5, %3                 \n\t"
        "decl %0                    \n\t"
        "1:                         \n\t" /* main loop: 4 rows per iteration */
        "movd (%1), %%mm0           \n\t"
        "add %4, %1                 \n\t"
        "movd (%1), %%mm1           \n\t"
        "add %4, %1                 \n\t"
        PAVGB" (%2), %%mm0          \n\t"
        PAVGB" 4(%2), %%mm1         \n\t"
        PAVGB" (%3), %%mm0          \n\t"
        "movd %%mm0, (%3)           \n\t"
        "add %5, %3                 \n\t"
        PAVGB" (%3), %%mm1          \n\t"
        "movd %%mm1, (%3)           \n\t"
        "add %5, %3                 \n\t"
        "movd (%1), %%mm0           \n\t"
        "add %4, %1                 \n\t"
        "movd (%1), %%mm1           \n\t"
        "add %4, %1                 \n\t"
        PAVGB" 8(%2), %%mm0         \n\t"
        PAVGB" 12(%2), %%mm1        \n\t"
        PAVGB" (%3), %%mm0          \n\t"
        "movd %%mm0, (%3)           \n\t"
        "add %5, %3                 \n\t"
        PAVGB" (%3), %%mm1          \n\t"
        "movd %%mm1, (%3)           \n\t"
        "add %5, %3                 \n\t"
        "add $16, %2                \n\t" /* src2 is packed: 4 rows x 4 bytes consumed */
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
}
273 | |
274 | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/**
 * Average two 8-pixel-wide sources and blend the result into dst.
 *
 * For each of the h rows: dst = PAVGB(PAVGB(src1, src2), dst), where
 * PAVGB expands to pavgb (MMX2) or pavgusb (3DNow!) depending on which
 * variant this file is being compiled as (see file header).
 *
 * src1 advances by src1Stride per row; src2 is a packed buffer that
 * advances by 8 bytes per row; dst advances by dstStride.
 *
 * The loop is unrolled 4 rows per iteration; an odd h is handled by the
 * prologue before label 1. NOTE(review): h values where (h & ~1) is not
 * a multiple of 4 would not terminate cleanly — presumably callers only
 * pass compatible heights; verify against call sites.
 */
static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        /* if h is odd, process one row first so the loop count is even */
        "testl $1, %0               \n\t"
        " jz 1f                     \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq (%2), %%mm1           \n\t"
        "add %4, %1                 \n\t"
        "add $8, %2                 \n\t"
        PAVGB" %%mm1, %%mm0         \n\t"
        PAVGB" (%3), %%mm0          \n\t" /* blend with existing dst (avg, not put) */
        "movq %%mm0, (%3)           \n\t"
        "add %5, %3                 \n\t"
        "decl %0                    \n\t"
        "1:                         \n\t" /* main loop: 4 rows per iteration */
        "movq (%1), %%mm0           \n\t"
        "add %4, %1                 \n\t"
        "movq (%1), %%mm1           \n\t"
        "add %4, %1                 \n\t"
        PAVGB" (%2), %%mm0          \n\t"
        PAVGB" 8(%2), %%mm1         \n\t"
        PAVGB" (%3), %%mm0          \n\t"
        "movq %%mm0, (%3)           \n\t"
        "add %5, %3                 \n\t"
        PAVGB" (%3), %%mm1          \n\t"
        "movq %%mm1, (%3)           \n\t"
        "add %5, %3                 \n\t"
        "movq (%1), %%mm0           \n\t"
        "add %4, %1                 \n\t"
        "movq (%1), %%mm1           \n\t"
        "add %4, %1                 \n\t"
        PAVGB" 16(%2), %%mm0        \n\t"
        PAVGB" 24(%2), %%mm1        \n\t"
        PAVGB" (%3), %%mm0          \n\t"
        "movq %%mm0, (%3)           \n\t"
        "add %5, %3                 \n\t"
        PAVGB" (%3), %%mm1          \n\t"
        "movq %%mm1, (%3)           \n\t"
        "add %5, %3                 \n\t"
        "add $32, %2                \n\t" /* src2 is packed: 4 rows x 8 bytes consumed */
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
329 | |
/**
 * Horizontal half-pel interpolation for a 16-pixel-wide block.
 *
 * Each destination byte is PAVGB(pixels[x], pixels[x+1]) — note the
 * "1(%1)"/"9(%1)" operands — written straight to block (put, no blend
 * with the previous block contents). PAVGB expands to pavgb (MMX2) or
 * pavgusb (3DNow!) depending on the compilation variant (see file header).
 *
 * Both pixels and block advance by line_size per row. The loop is
 * unrolled 4 rows per iteration; NOTE(review): h is presumably a
 * multiple of 4 — "subl $4"/"jnz" would not terminate otherwise.
 */
static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"    \n\t" /* REG_a = 2*line_size, to step two rows at once */
        "1:                         \n\t"
        /* rows 0 and 1: left 8 bytes in mm0/mm1, right 8 bytes in mm2/mm3 */
        "movq (%1), %%mm0           \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq 8(%1), %%mm2          \n\t"
        "movq 8(%1, %3), %%mm3      \n\t"
        PAVGB" 1(%1), %%mm0         \n\t" /* average with the pixel one to the right */
        PAVGB" 1(%1, %3), %%mm1     \n\t"
        PAVGB" 9(%1), %%mm2         \n\t"
        PAVGB" 9(%1, %3), %%mm3     \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "movq %%mm2, 8(%2)          \n\t"
        "movq %%mm3, 8(%2, %3)      \n\t"
        "add %%"REG_a", %1          \n\t"
        "add %%"REG_a", %2          \n\t"
        /* rows 2 and 3 */
        "movq (%1), %%mm0           \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq 8(%1), %%mm2          \n\t"
        "movq 8(%1, %3), %%mm3      \n\t"
        PAVGB" 1(%1), %%mm0         \n\t"
        PAVGB" 1(%1, %3), %%mm1     \n\t"
        PAVGB" 9(%1), %%mm2         \n\t"
        PAVGB" 9(%1, %3), %%mm3     \n\t"
        "add %%"REG_a", %1          \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "movq %%mm2, 8(%2)          \n\t"
        "movq %%mm3, 8(%2, %3)      \n\t"
        "add %%"REG_a", %2          \n\t"
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
954 | 369 |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/**
 * Average two 16-pixel-wide sources and store the result to dst.
 *
 * For each of the h rows: dst = PAVGB(src1, src2), handled as two 8-byte
 * halves in mm0/mm1 (put — dst's previous contents are overwritten, not
 * blended). PAVGB expands to pavgb (MMX2) or pavgusb (3DNow!) depending
 * on the compilation variant (see file header).
 *
 * src1 advances by src1Stride per row; src2 is a packed buffer that
 * advances by 16 bytes per row; dst advances by dstStride.
 *
 * The loop is unrolled 2 rows per iteration; an odd h is handled by the
 * prologue before label 1.
 */
static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        /* if h is odd, process one row first so the loop count is even */
        "testl $1, %0               \n\t"
        " jz 1f                     \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq 8(%1), %%mm1          \n\t"
        PAVGB" (%2), %%mm0          \n\t"
        PAVGB" 8(%2), %%mm1         \n\t"
        "add %4, %1                 \n\t"
        "add $16, %2                \n\t"
        "movq %%mm0, (%3)           \n\t"
        "movq %%mm1, 8(%3)          \n\t"
        "add %5, %3                 \n\t"
        "decl %0                    \n\t"
        "1:                         \n\t" /* main loop: 2 rows per iteration */
        "movq (%1), %%mm0           \n\t"
        "movq 8(%1), %%mm1          \n\t"
        "add %4, %1                 \n\t"
        PAVGB" (%2), %%mm0          \n\t"
        PAVGB" 8(%2), %%mm1         \n\t"
        "movq %%mm0, (%3)           \n\t"
        "movq %%mm1, 8(%3)          \n\t"
        "add %5, %3                 \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq 8(%1), %%mm1          \n\t"
        "add %4, %1                 \n\t"
        PAVGB" 16(%2), %%mm0        \n\t"
        PAVGB" 24(%2), %%mm1        \n\t"
        "movq %%mm0, (%3)           \n\t"
        "movq %%mm1, 8(%3)          \n\t"
        "add %5, %3                 \n\t"
        "add $32, %2                \n\t" /* src2 is packed: 2 rows x 16 bytes consumed */
        "subl $2, %0                \n\t"
        "jnz 1b                     \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
417 |
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2024
diff
changeset
|
/*
 * Average a 16-pixel-wide block over h rows into dst:
 * dst = pavg(dst, pavg(src1, src2)) per byte, using PAVGB (mmx2) or
 * PAVGBR-equivalent (3dnow) depending on which DEF() variant is compiled.
 * src1 advances by src1Stride per row; src2 is packed with a fixed stride
 * of 16 bytes per row; dst advances by dstStride.
 * NOTE(review): h is presumably even or odd >= 1; the prologue handles one
 * row when h is odd, then the loop processes two rows per iteration.
 */
static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        /* if h is odd, process one row first so the main loop can do 2 rows */
        "testl $1, %0                   \n\t"
        " jz 1f                         \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq  8(%1), %%mm1             \n\t"
        PAVGB"  (%2), %%mm0             \n\t"
        PAVGB" 8(%2), %%mm1             \n\t"
        "add %4, %1                     \n\t"
        "add $16, %2                    \n\t"
        /* second average against the existing dst contents (the "avg" part) */
        PAVGB"  (%3), %%mm0             \n\t"
        PAVGB" 8(%3), %%mm1             \n\t"
        "movq %%mm0,  (%3)              \n\t"
        "movq %%mm1, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "decl %0                        \n\t"
        "1:                             \n\t"
        /* main loop: two rows per iteration */
        "movq   (%1), %%mm0             \n\t"
        "movq  8(%1), %%mm1             \n\t"
        "add %4, %1                     \n\t"
        PAVGB"  (%2), %%mm0             \n\t"
        PAVGB" 8(%2), %%mm1             \n\t"
        PAVGB"  (%3), %%mm0             \n\t"
        PAVGB" 8(%3), %%mm1             \n\t"
        "movq %%mm0,  (%3)              \n\t"
        "movq %%mm1, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq  8(%1), %%mm1             \n\t"
        "add %4, %1                     \n\t"
        /* second row of src2 lives at offset 16 (packed 16-byte stride) */
        PAVGB" 16(%2), %%mm0            \n\t"
        PAVGB" 24(%2), %%mm1            \n\t"
        PAVGB"  (%3), %%mm0             \n\t"
        PAVGB" 8(%3), %%mm1             \n\t"
        "movq %%mm0,  (%3)              \n\t"
        "movq %%mm1, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "add $32, %2                    \n\t"
        "subl $2, %0                    \n\t"
        "jnz 1b                         \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
/*
 * Store the no-rounding (truncating) byte-wise average of src1 and src2
 * into dst for a 16-pixel-wide block over h rows.
 * PAVGB rounds up; truncating average is obtained via the complement trick:
 *   ~pavg(~a, ~b) == (a + b) >> 1   (rounds down)
 * mm6 is set to all-ones (pcmpeqb x,x) and used for the pxor complements.
 * src1 advances by src1Stride per row; src2 is packed with a fixed 16-byte
 * per-row stride; dst advances by dstStride.
 */
static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm __volatile(
        "pcmpeqb %%mm6, %%mm6           \n\t"  /* mm6 = 0xFF.. (complement mask) */
        /* if h is odd, process one row first so the main loop can do 2 rows */
        "testl $1, %0                   \n\t"
        " jz 1f                         \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq  8(%1), %%mm1             \n\t"
        "movq   (%2), %%mm2             \n\t"
        "movq  8(%2), %%mm3             \n\t"
        "pxor %%mm6, %%mm0              \n\t"  /* complement inputs ... */
        "pxor %%mm6, %%mm1              \n\t"
        "pxor %%mm6, %%mm2              \n\t"
        "pxor %%mm6, %%mm3              \n\t"
        PAVGB" %%mm2, %%mm0             \n\t"  /* ... rounded average ... */
        PAVGB" %%mm3, %%mm1             \n\t"
        "pxor %%mm6, %%mm0              \n\t"  /* ... complement back => truncating avg */
        "pxor %%mm6, %%mm1              \n\t"
        "add %4, %1                     \n\t"
        "add $16, %2                    \n\t"
        "movq %%mm0,  (%3)              \n\t"
        "movq %%mm1, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "decl %0                        \n\t"
        "1:                             \n\t"
        /* main loop: two rows per iteration */
        "movq   (%1), %%mm0             \n\t"
        "movq  8(%1), %%mm1             \n\t"
        "add %4, %1                     \n\t"
        "movq   (%2), %%mm2             \n\t"
        "movq  8(%2), %%mm3             \n\t"
        "pxor %%mm6, %%mm0              \n\t"
        "pxor %%mm6, %%mm1              \n\t"
        "pxor %%mm6, %%mm2              \n\t"
        "pxor %%mm6, %%mm3              \n\t"
        PAVGB" %%mm2, %%mm0             \n\t"
        PAVGB" %%mm3, %%mm1             \n\t"
        "pxor %%mm6, %%mm0              \n\t"
        "pxor %%mm6, %%mm1              \n\t"
        "movq %%mm0,  (%3)              \n\t"
        "movq %%mm1, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq  8(%1), %%mm1             \n\t"
        "add %4, %1                     \n\t"
        /* second row of src2 lives at offset 16 (packed 16-byte stride) */
        "movq 16(%2), %%mm2             \n\t"
        "movq 24(%2), %%mm3             \n\t"
        "pxor %%mm6, %%mm0              \n\t"
        "pxor %%mm6, %%mm1              \n\t"
        "pxor %%mm6, %%mm2              \n\t"
        "pxor %%mm6, %%mm3              \n\t"
        PAVGB" %%mm2, %%mm0             \n\t"
        PAVGB" %%mm3, %%mm1             \n\t"
        "pxor %%mm6, %%mm0              \n\t"
        "pxor %%mm6, %%mm1              \n\t"
        "movq %%mm0,  (%3)              \n\t"
        "movq %%mm1, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "add $32, %2                    \n\t"
        "subl $2, %0                    \n\t"
        "jnz 1b                         \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((long)src1Stride), "D"((long)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
441 | 544 |
/* GL: this function does incorrect rounding if overflow */
/*
 * Half-pel horizontal interpolation without rounding for an 8-pixel-wide
 * block over h rows: block[y][x] = truncating_avg(pixels[y][x], pixels[y][x+1]).
 * Truncation is approximated by subtracting 1 (saturating, mm6 = BONE = 0x01
 * in every byte) from one operand before the rounding PAVGB — hence the
 * "incorrect rounding if overflow" caveat above when the subtraction saturates
 * at 0.  Processes 4 rows per loop iteration; REG_a holds 2*line_size.
 * NOTE(review): h is presumably a multiple of 4 — the loop decrements by 4.
 */
static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BONE(mm6);                             /* mm6 = 0x0101010101010101 */
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"   /* REG_a = 2*line_size */
        "1:                             \n\t"
        /* rows 0 and 1: load pixel and pixel+1 pairs */
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "movq 1(%1), %%mm1              \n\t"
        "movq 1(%1, %3), %%mm3          \n\t"
        "add %%"REG_a", %1              \n\t"
        "psubusb %%mm6, %%mm0           \n\t"   /* bias down by 1 to cancel PAVGB round-up */
        "psubusb %%mm6, %%mm2           \n\t"
        PAVGB" %%mm1, %%mm0             \n\t"
        PAVGB" %%mm3, %%mm2             \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        /* rows 2 and 3 */
        "movq (%1), %%mm0               \n\t"
        "movq 1(%1), %%mm1              \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "movq 1(%1, %3), %%mm3          \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        "psubusb %%mm6, %%mm0           \n\t"
        "psubusb %%mm6, %%mm2           \n\t"
        PAVGB" %%mm1, %%mm0             \n\t"
        PAVGB" %%mm3, %%mm2             \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
582 | |
/* put_pixels8_y2: write an 8-byte-wide, h-line-high block where each output
 * byte is the PAVGB average of the source pixel and the pixel one line
 * below it (vertical half-pel interpolation).
 * Operand map: %0 = h, %1 = pixels (src), %2 = block (dst), %3 = line_size.
 * The loop body handles 4 lines per iteration ("subl $4, %0"), so h is
 * presumably a multiple of 4 -- TODO confirm against callers. */
static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t" /* REG_a = 2*line_size */
        "movq (%1), %%mm0               \n\t" /* prime mm0 with src line 0 */
        "sub %3, %2                     \n\t" /* bias dst: (%2,%3) is current line */
        "1:                             \n\t"
        "movq (%1, %3), %%mm1           \n\t" /* src line n+1 */
        "movq (%1, %%"REG_a"), %%mm2    \n\t" /* src line n+2 */
        "add %%"REG_a", %1              \n\t"
        PAVGB" %%mm1, %%mm0             \n\t" /* avg(line n,   line n+1) */
        PAVGB" %%mm2, %%mm1             \n\t" /* avg(line n+1, line n+2) */
        "movq %%mm0, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        PAVGB" %%mm1, %%mm2             \n\t"
        PAVGB" %%mm0, %%mm1             \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t" /* 4 output lines per iteration */
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D" (block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
612 | |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
/* GL: this function does incorrect rounding if overflow */
/* put_no_rnd_pixels8_y2: like put_pixels8_y2 but with "no rounding"
 * semantics: mm6 is loaded with a per-byte constant via MOVQ_BONE and
 * subtracted (saturating, psubusb) from one operand before PAVGB,
 * turning the round-up average into a truncating one. The saturation
 * on 0x00 bytes is what the comment above calls incorrect rounding.
 * Operand map: %0 = h, %1 = pixels (src), %2 = block (dst), %3 = line_size;
 * 4 lines per loop iteration, so h presumably a multiple of 4. */
static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BONE(mm6);
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t" /* REG_a = 2*line_size */
        "movq (%1), %%mm0               \n\t" /* prime with src line 0 */
        "sub %3, %2                     \n\t"
        "1:                             \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "add %%"REG_a", %1              \n\t"
        "psubusb %%mm6, %%mm1           \n\t" /* bias one operand -> truncating avg */
        PAVGB" %%mm1, %%mm0             \n\t"
        PAVGB" %%mm2, %%mm1             \n\t"
        "movq %%mm0, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        "psubusb %%mm6, %%mm1           \n\t"
        PAVGB" %%mm1, %%mm2             \n\t"
        PAVGB" %%mm0, %%mm1             \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D" (block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
646 | |
/* avg_pixels8: average an 8-byte-wide, h-line-high source block into the
 * destination in place: dst = PAVGB(dst, src) for every byte (the classic
 * motion-compensation "avg" primitive).
 * Operand map: %0 = h, %1 = pixels (src), %2 = block (dst), %3 = line_size;
 * 4 lines per loop iteration, so h presumably a multiple of 4. */
static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t" /* REG_a = 2*line_size */
        "1:                             \n\t"
        "movq (%2), %%mm0               \n\t" /* dst line 0 */
        "movq (%2, %3), %%mm1           \n\t" /* dst line 1 */
        PAVGB" (%1), %%mm0              \n\t" /* avg with src line 0 */
        PAVGB" (%1, %3), %%mm1          \n\t" /* avg with src line 1 */
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%2), %%mm0               \n\t" /* same again for lines 2/3 */
        "movq (%2, %3), %%mm1           \n\t"
        PAVGB" (%1), %%mm0              \n\t"
        PAVGB" (%1, %3), %%mm1          \n\t"
        "add %%"REG_a", %1              \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t" /* 4 lines per iteration */
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
674 | |
/* avg_pixels8_x2: horizontal half-pel interpolation averaged into dst.
 * For each byte: tmp = PAVGB(src[x], src[x+1]); dst = PAVGB(dst, tmp).
 * Operand map: %0 = h, %1 = pixels (src), %2 = block (dst), %3 = line_size;
 * 4 lines per loop iteration, so h presumably a multiple of 4.
 * Note the unaligned 1(%1) loads are implicit in the PAVGB memory operand. */
static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t" /* REG_a = 2*line_size */
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        PAVGB" 1(%1), %%mm0             \n\t" /* avg(src, src+1): half-pel x */
        PAVGB" 1(%1, %3), %%mm2         \n\t"
        PAVGB" (%2), %%mm0              \n\t" /* then average into dst */
        PAVGB" (%2, %3), %%mm2          \n\t"
        "add %%"REG_a", %1              \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "movq (%1), %%mm0               \n\t" /* lines 2/3 of the unrolled pair */
        "movq (%1, %3), %%mm2           \n\t"
        PAVGB" 1(%1), %%mm0             \n\t"
        PAVGB" 1(%1, %3), %%mm2         \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        PAVGB" (%2), %%mm0              \n\t"
        PAVGB" (%2, %3), %%mm2          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t" /* 4 lines per iteration */
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
706 | |
/* avg_pixels8_y2: vertical half-pel interpolation averaged into dst.
 * For each byte: tmp = PAVGB(src[y], src[y+1]); dst = PAVGB(dst, tmp).
 * Operand map: %0 = h, %1 = pixels (src), %2 = block (dst), %3 = line_size;
 * 4 lines per loop iteration, so h presumably a multiple of 4. */
static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t" /* REG_a = 2*line_size */
        "movq (%1), %%mm0               \n\t" /* prime with src line 0 */
        "sub %3, %2                     \n\t" /* bias dst: (%2,%3) is current line */
        "1:                             \n\t"
        "movq (%1, %3), %%mm1           \n\t" /* src line n+1 */
        "movq (%1, %%"REG_a"), %%mm2    \n\t" /* src line n+2 */
        "add %%"REG_a", %1              \n\t"
        PAVGB" %%mm1, %%mm0             \n\t" /* vertical half-pel averages */
        PAVGB" %%mm2, %%mm1             \n\t"
        "movq (%2, %3), %%mm3           \n\t" /* existing dst lines */
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        PAVGB" %%mm3, %%mm0             \n\t" /* average result into dst */
        PAVGB" %%mm4, %%mm1             \n\t"
        "movq %%mm0, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        PAVGB" %%mm1, %%mm2             \n\t"
        PAVGB" %%mm0, %%mm1             \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        "movq (%2, %3), %%mm3           \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        PAVGB" %%mm3, %%mm2             \n\t"
        PAVGB" %%mm4, %%mm1             \n\t"
        "movq %%mm2, (%2, %3)           \n\t"
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t" /* 4 lines per iteration */
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
744 | |
// Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter
/* avg_pixels8_xy2: 2D (diagonal) half-pel interpolation averaged into dst,
 * approximated with chained PAVGBs instead of the exact (a+b+c+d+2)/4 --
 * hence the rounding caveat above. mm6 (MOVQ_BONE) is psubusb'ed from one
 * intermediate to partially compensate the accumulated round-up bias.
 * Operand map: %0 = h, %1 = pixels (src), %2 = block (dst), %3 = line_size;
 * 4 lines per loop iteration, so h presumably a multiple of 4. */
static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BONE(mm6);
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t" /* REG_a = 2*line_size */
        "movq (%1), %%mm0               \n\t"
        PAVGB" 1(%1), %%mm0             \n\t" /* mm0 = x-avg of line 0 */
        ".balign 8                      \n\t" /* align the loop head */
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "psubusb %%mm6, %%mm2           \n\t" /* rounding-bias compensation */
        PAVGB" 1(%1, %3), %%mm1         \n\t" /* x-avg of line n+1 */
        PAVGB" 1(%1, %%"REG_a"), %%mm2  \n\t" /* x-avg of line n+2 */
        "add %%"REG_a", %1              \n\t"
        PAVGB" %%mm1, %%mm0             \n\t" /* y-avg: combine adjacent lines */
        PAVGB" %%mm2, %%mm1             \n\t"
        PAVGB" (%2), %%mm0              \n\t" /* average into existing dst */
        PAVGB" (%2, %3), %%mm1          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        PAVGB" 1(%1, %3), %%mm1         \n\t"
        PAVGB" 1(%1, %%"REG_a"), %%mm0  \n\t"
        "add %%"REG_a", %2              \n\t"
        "add %%"REG_a", %1              \n\t"
        PAVGB" %%mm1, %%mm2             \n\t"
        PAVGB" %%mm0, %%mm1             \n\t"
        PAVGB" (%2), %%mm2              \n\t"
        PAVGB" (%2, %3), %%mm1          \n\t"
        "movq %%mm2, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t" /* 4 lines per iteration */
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((long)line_size)
        :"%"REG_a, "memory");
}
651 | 786 |
787 //FIXME the following could be optimized too ... | |
1064 | 788 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 789 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); |
790 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); | |
791 } | |
1064 | 792 static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 793 DEF(put_pixels8_y2)(block , pixels , line_size, h); |
794 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); | |
795 } | |
1064 | 796 static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 797 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); |
798 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); | |
799 } | |
1064 | 800 static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 801 DEF(avg_pixels8)(block , pixels , line_size, h); |
802 DEF(avg_pixels8)(block+8, pixels+8, line_size, h); | |
803 } | |
1064 | 804 static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 805 DEF(avg_pixels8_x2)(block , pixels , line_size, h); |
806 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); | |
807 } | |
1064 | 808 static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 809 DEF(avg_pixels8_y2)(block , pixels , line_size, h); |
810 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); | |
811 } | |
1064 | 812 static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 813 DEF(avg_pixels8_xy2)(block , pixels , line_size, h); |
814 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); | |
815 } | |
816 |