libavcodec.hg: i386/dsputil_mmx

annotate i386/dsputil_mmx_avg.h @ 2662:2fe9599170f6 libavcodec

typo

author	michael
date	Tue, 10 May 2005 21:02:20 +0000
parents	00f608ae3791
children	ef2149182f1c

rev	line source
0 986e461dc072 Initial revision glantau parents: diff changeset	1 /*
986e461dc072 Initial revision glantau parents: diff changeset	2 * DSP utils : average functions are compiled twice for 3dnow/mmx2
429 718a22dc121f license/copyright change glantau parents: 416 diff changeset	3 * Copyright (c) 2000, 2001 Fabrice Bellard.
1739 07a484280a82 copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise michael parents: 1064 diff changeset	4 * Copyright (c) 2002-2004 Michael Niedermayer
0 986e461dc072 Initial revision glantau parents: diff changeset	5 *
429 718a22dc121f license/copyright change glantau parents: 416 diff changeset	6 * This library is free software; you can redistribute it and/or
718a22dc121f license/copyright change glantau parents: 416 diff changeset	7 * modify it under the terms of the GNU Lesser General Public
718a22dc121f license/copyright change glantau parents: 416 diff changeset	8 * License as published by the Free Software Foundation; either
718a22dc121f license/copyright change glantau parents: 416 diff changeset	9 * version 2 of the License, or (at your option) any later version.
0 986e461dc072 Initial revision glantau parents: diff changeset	10 *
429 718a22dc121f license/copyright change glantau parents: 416 diff changeset	11 * This library is distributed in the hope that it will be useful,
0 986e461dc072 Initial revision glantau parents: diff changeset	12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
429 718a22dc121f license/copyright change glantau parents: 416 diff changeset	13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
718a22dc121f license/copyright change glantau parents: 416 diff changeset	14 * Lesser General Public License for more details.
0 986e461dc072 Initial revision glantau parents: diff changeset	15 *
429 718a22dc121f license/copyright change glantau parents: 416 diff changeset	16 * You should have received a copy of the GNU Lesser General Public
718a22dc121f license/copyright change glantau parents: 416 diff changeset	17 * License along with this library; if not, write to the Free Software
718a22dc121f license/copyright change glantau parents: 416 diff changeset	18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0 986e461dc072 Initial revision glantau parents: diff changeset	19 *
986e461dc072 Initial revision glantau parents: diff changeset	20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	22 * and improved by Zdenek Kabelac <kabi@users.sf.net>
0 986e461dc072 Initial revision glantau parents: diff changeset	23 */
387 b8f3affeb8e1 shared lib support (req by kabi) ... michaelni parents: 386 diff changeset	24
389 f874493a1970 tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests) glantau parents: 387 diff changeset	25 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	26 clobber bug - now it will work with 2.95.2 and also with -fPIC
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	27 */
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	28 static void DEF(put_pixels8_x2)(uint8_t block, const uint8_t pixels, int line_size, int h)
0 986e461dc072 Initial revision glantau parents: diff changeset	29 {
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	30 __asm __volatile(
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	31 "lea (%3, %3), %%"REG_a" \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	32 "1: \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	33 "movq (%1), %%mm0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	34 "movq (%1, %3), %%mm1 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	35 PAVGB" 1(%1), %%mm0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	36 PAVGB" 1(%1, %3), %%mm1 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	37 "movq %%mm0, (%2) \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	38 "movq %%mm1, (%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	39 "add %%"REG_a", %1 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	40 "add %%"REG_a", %2 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	41 "movq (%1), %%mm0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	42 "movq (%1, %3), %%mm1 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	43 PAVGB" 1(%1), %%mm0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	44 PAVGB" 1(%1, %3), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	45 "add %%"REG_a", %1 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	46 "movq %%mm0, (%2) \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	47 "movq %%mm1, (%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	48 "add %%"REG_a", %2 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	49 "subl $4, %0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	50 "jnz 1b \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	51 :"+g"(h), "+S"(pixels), "+D"(block)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	52 :"r" ((long)line_size)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	53 :"%"REG_a, "memory");
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	54 }
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	55
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	56 static void DEF(put_pixels4_l2)(uint8_t dst, uint8_t src1, uint8_t *src2, int dstStride, int src1Stride, int h)
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	57 {
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	58 __asm __volatile(
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	59 "testl $1, %0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	60 " jz 1f \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	61 "movd (%1), %%mm0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	62 "movd (%2), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	63 "add %4, %1 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	64 "add $4, %2 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	65 PAVGB" %%mm1, %%mm0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	66 "movd %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	67 "add %5, %3 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	68 "decl %0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	69 "1: \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	70 "movd (%1), %%mm0 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	71 "add %4, %1 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	72 "movd (%1), %%mm1 \n\t"
2611 00f608ae3791 read 32bit instead of 64bit to avoid overreading and missalignments michael parents: 2293 diff changeset	73 "movd (%2), %%mm2 \n\t"
00f608ae3791 read 32bit instead of 64bit to avoid overreading and missalignments michael parents: 2293 diff changeset	74 "movd 4(%2), %%mm3 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	75 "add %4, %1 \n\t"
2611 00f608ae3791 read 32bit instead of 64bit to avoid overreading and missalignments michael parents: 2293 diff changeset	76 PAVGB" %%mm2, %%mm0 \n\t"
00f608ae3791 read 32bit instead of 64bit to avoid overreading and missalignments michael parents: 2293 diff changeset	77 PAVGB" %%mm3, %%mm1 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	78 "movd %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	79 "add %5, %3 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	80 "movd %%mm1, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	81 "add %5, %3 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	82 "movd (%1), %%mm0 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	83 "add %4, %1 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	84 "movd (%1), %%mm1 \n\t"
2611 00f608ae3791 read 32bit instead of 64bit to avoid overreading and missalignments michael parents: 2293 diff changeset	85 "movd 8(%2), %%mm2 \n\t"
00f608ae3791 read 32bit instead of 64bit to avoid overreading and missalignments michael parents: 2293 diff changeset	86 "movd 12(%2), %%mm3 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	87 "add %4, %1 \n\t"
2611 00f608ae3791 read 32bit instead of 64bit to avoid overreading and missalignments michael parents: 2293 diff changeset	88 PAVGB" %%mm2, %%mm0 \n\t"
00f608ae3791 read 32bit instead of 64bit to avoid overreading and missalignments michael parents: 2293 diff changeset	89 PAVGB" %%mm3, %%mm1 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	90 "movd %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	91 "add %5, %3 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	92 "movd %%mm1, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	93 "add %5, %3 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	94 "add $16, %2 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	95 "subl $4, %0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	96 "jnz 1b \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	97 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	98 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	99 #else
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	100 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	101 #endif
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	102 :"S"((long)src1Stride), "D"((long)dstStride)
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	103 :"memory");
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	104 }
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	105
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	106
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	107 static void DEF(put_pixels8_l2)(uint8_t dst, uint8_t src1, uint8_t *src2, int dstStride, int src1Stride, int h)
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	108 {
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	109 __asm __volatile(
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	110 "testl $1, %0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	111 " jz 1f \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	112 "movq (%1), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	113 "movq (%2), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	114 "add %4, %1 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	115 "add $8, %2 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	116 PAVGB" %%mm1, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	117 "movq %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	118 "add %5, %3 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	119 "decl %0 \n\t"
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	120 "1: \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	121 "movq (%1), %%mm0 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	122 "add %4, %1 \n\t"
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	123 "movq (%1), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	124 "add %4, %1 \n\t"
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	125 PAVGB" (%2), %%mm0 \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	126 PAVGB" 8(%2), %%mm1 \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	127 "movq %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	128 "add %5, %3 \n\t"
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	129 "movq %%mm1, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	130 "add %5, %3 \n\t"
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	131 "movq (%1), %%mm0 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	132 "add %4, %1 \n\t"
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	133 "movq (%1), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	134 "add %4, %1 \n\t"
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	135 PAVGB" 16(%2), %%mm0 \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	136 PAVGB" 24(%2), %%mm1 \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	137 "movq %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	138 "add %5, %3 \n\t"
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	139 "movq %%mm1, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	140 "add %5, %3 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	141 "add $32, %2 \n\t"
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	142 "subl $4, %0 \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	143 "jnz 1b \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	144 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	145 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	146 #else
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	147 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	148 #endif
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	149 :"S"((long)src1Stride), "D"((long)dstStride)
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	150 :"memory");
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	151 //the following should be used, though better not with gcc ...
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	152 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	153 :"r"(src1Stride), "r"(dstStride)
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	154 :"memory");*/
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	155 }
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	156
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	157 static void DEF(put_no_rnd_pixels8_l2)(uint8_t dst, uint8_t src1, uint8_t *src2, int dstStride, int src1Stride, int h)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	158 {
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	159 __asm __volatile(
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	160 "pcmpeqb %%mm6, %%mm6 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	161 "testl $1, %0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	162 " jz 1f \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	163 "movq (%1), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	164 "movq (%2), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	165 "add %4, %1 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	166 "add $8, %2 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	167 "pxor %%mm6, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	168 "pxor %%mm6, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	169 PAVGB" %%mm1, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	170 "pxor %%mm6, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	171 "movq %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	172 "add %5, %3 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	173 "decl %0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	174 "1: \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	175 "movq (%1), %%mm0 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	176 "add %4, %1 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	177 "movq (%1), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	178 "add %4, %1 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	179 "movq (%2), %%mm2 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	180 "movq 8(%2), %%mm3 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	181 "pxor %%mm6, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	182 "pxor %%mm6, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	183 "pxor %%mm6, %%mm2 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	184 "pxor %%mm6, %%mm3 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	185 PAVGB" %%mm2, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	186 PAVGB" %%mm3, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	187 "pxor %%mm6, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	188 "pxor %%mm6, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	189 "movq %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	190 "add %5, %3 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	191 "movq %%mm1, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	192 "add %5, %3 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	193 "movq (%1), %%mm0 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	194 "add %4, %1 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	195 "movq (%1), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	196 "add %4, %1 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	197 "movq 16(%2), %%mm2 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	198 "movq 24(%2), %%mm3 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	199 "pxor %%mm6, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	200 "pxor %%mm6, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	201 "pxor %%mm6, %%mm2 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	202 "pxor %%mm6, %%mm3 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	203 PAVGB" %%mm2, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	204 PAVGB" %%mm3, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	205 "pxor %%mm6, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	206 "pxor %%mm6, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	207 "movq %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	208 "add %5, %3 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	209 "movq %%mm1, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	210 "add %5, %3 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	211 "add $32, %2 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	212 "subl $4, %0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	213 "jnz 1b \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	214 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	215 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	216 #else
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	217 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	218 #endif
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	219 :"S"((long)src1Stride), "D"((long)dstStride)
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	220 :"memory");
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	221 //the following should be used, though better not with gcc ...
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	222 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	223 :"r"(src1Stride), "r"(dstStride)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	224 :"memory");*/
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	225 }
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	226
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	227 static void DEF(avg_pixels4_l2)(uint8_t dst, uint8_t src1, uint8_t *src2, int dstStride, int src1Stride, int h)
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	228 {
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	229 __asm __volatile(
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	230 "testl $1, %0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	231 " jz 1f \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	232 "movd (%1), %%mm0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	233 "movd (%2), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	234 "add %4, %1 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	235 "add $4, %2 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	236 PAVGB" %%mm1, %%mm0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	237 PAVGB" (%3), %%mm0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	238 "movd %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	239 "add %5, %3 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	240 "decl %0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	241 "1: \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	242 "movd (%1), %%mm0 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	243 "add %4, %1 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	244 "movd (%1), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	245 "add %4, %1 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	246 PAVGB" (%2), %%mm0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	247 PAVGB" 4(%2), %%mm1 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	248 PAVGB" (%3), %%mm0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	249 "movd %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	250 "add %5, %3 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	251 PAVGB" (%3), %%mm1 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	252 "movd %%mm1, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	253 "add %5, %3 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	254 "movd (%1), %%mm0 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	255 "add %4, %1 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	256 "movd (%1), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	257 "add %4, %1 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	258 PAVGB" 8(%2), %%mm0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	259 PAVGB" 12(%2), %%mm1 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	260 PAVGB" (%3), %%mm0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	261 "movd %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	262 "add %5, %3 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	263 PAVGB" (%3), %%mm1 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	264 "movd %%mm1, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	265 "add %5, %3 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	266 "add $16, %2 \n\t"
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	267 "subl $4, %0 \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	268 "jnz 1b \n\t"
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	269 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	270 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	271 #else
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	272 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	273 #endif
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	274 :"S"((long)src1Stride), "D"((long)dstStride)
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	275 :"memory");
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	276 }
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	277
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	278
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	279 static void DEF(avg_pixels8_l2)(uint8_t dst, uint8_t src1, uint8_t *src2, int dstStride, int src1Stride, int h)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	280 {
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	281 __asm __volatile(
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	282 "testl $1, %0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	283 " jz 1f \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	284 "movq (%1), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	285 "movq (%2), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	286 "add %4, %1 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	287 "add $8, %2 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	288 PAVGB" %%mm1, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	289 PAVGB" (%3), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	290 "movq %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	291 "add %5, %3 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	292 "decl %0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	293 "1: \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	294 "movq (%1), %%mm0 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	295 "add %4, %1 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	296 "movq (%1), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	297 "add %4, %1 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	298 PAVGB" (%2), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	299 PAVGB" 8(%2), %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	300 PAVGB" (%3), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	301 "movq %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	302 "add %5, %3 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	303 PAVGB" (%3), %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	304 "movq %%mm1, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	305 "add %5, %3 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	306 "movq (%1), %%mm0 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	307 "add %4, %1 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	308 "movq (%1), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	309 "add %4, %1 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	310 PAVGB" 16(%2), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	311 PAVGB" 24(%2), %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	312 PAVGB" (%3), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	313 "movq %%mm0, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	314 "add %5, %3 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	315 PAVGB" (%3), %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	316 "movq %%mm1, (%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	317 "add %5, %3 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	318 "add $32, %2 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	319 "subl $4, %0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	320 "jnz 1b \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	321 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	322 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	323 #else
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	324 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	325 #endif
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	326 :"S"((long)src1Stride), "D"((long)dstStride)
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	327 :"memory");
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	328 //the following should be used, though better not with gcc ...
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	329 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	330 :"r"(src1Stride), "r"(dstStride)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	331 :"memory");*/
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	332 }
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	333
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	334 static void DEF(put_pixels16_x2)(uint8_t block, const uint8_t pixels, int line_size, int h)
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	335 {
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	336 __asm __volatile(
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	337 "lea (%3, %3), %%"REG_a" \n\t"
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	338 "1: \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	339 "movq (%1), %%mm0 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	340 "movq (%1, %3), %%mm1 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	341 "movq 8(%1), %%mm2 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	342 "movq 8(%1, %3), %%mm3 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	343 PAVGB" 1(%1), %%mm0 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	344 PAVGB" 1(%1, %3), %%mm1 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	345 PAVGB" 9(%1), %%mm2 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	346 PAVGB" 9(%1, %3), %%mm3 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	347 "movq %%mm0, (%2) \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	348 "movq %%mm1, (%2, %3) \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	349 "movq %%mm2, 8(%2) \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	350 "movq %%mm3, 8(%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	351 "add %%"REG_a", %1 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	352 "add %%"REG_a", %2 \n\t"
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	353 "movq (%1), %%mm0 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	354 "movq (%1, %3), %%mm1 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	355 "movq 8(%1), %%mm2 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	356 "movq 8(%1, %3), %%mm3 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	357 PAVGB" 1(%1), %%mm0 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	358 PAVGB" 1(%1, %3), %%mm1 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	359 PAVGB" 9(%1), %%mm2 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	360 PAVGB" 9(%1, %3), %%mm3 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	361 "add %%"REG_a", %1 \n\t"
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	362 "movq %%mm0, (%2) \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	363 "movq %%mm1, (%2, %3) \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	364 "movq %%mm2, 8(%2) \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	365 "movq %%mm3, 8(%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	366 "add %%"REG_a", %2 \n\t"
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	367 "subl $4, %0 \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	368 "jnz 1b \n\t"
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	369 :"+g"(h), "+S"(pixels), "+D"(block)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	370 :"r" ((long)line_size)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	371 :"%"REG_a, "memory");
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	372 }
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	373
/*
 * put_pixels16_l2: write the byte-wise PAVGB average of two 16-byte-wide
 * sources into dst, one row at a time.
 *
 *   dst        - destination; advanced by dstStride per row
 *   src1       - first source; advanced by src1Stride per row
 *   src2       - second source; read as consecutive 16-byte rows (the code
 *                advances it by 16 bytes per row, 32 per loop iteration)
 *   dstStride  - byte stride of dst
 *   src1Stride - byte stride of src1
 *   h          - row count; an odd count is handled by doing one row up
 *                front, after which rows are processed in pairs
 *
 * NOTE: the two-row loop is entered without checking the counter and only
 * exits when it reaches exactly zero, so h is expected to be >= 2.
 *
 * DEF() and PAVGB are macros supplied by the including file.
 *
 * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *               %4 = src1Stride, %5 = dstStride.
 * Fix: the pointer declarators on dst/src1 had been lost; both are used as
 * base addresses below and must be pointers, like src2.
 */
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	374 static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	375 {
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	376 __asm __volatile(
/* odd row count: process a single row first so the remainder is even */
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	377 "testl $1, %0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	378 " jz 1f \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	379 "movq (%1), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	380 "movq 8(%1), %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	381 PAVGB" (%2), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	382 PAVGB" 8(%2), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	383 "add %4, %1 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	384 "add $16, %2 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	385 "movq %%mm0, (%3) \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	386 "movq %%mm1, 8(%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	387 "add %5, %3 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	388 "decl %0 \n\t"
/* main loop: two rows per iteration */
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	389 "1: \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	390 "movq (%1), %%mm0 \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	391 "movq 8(%1), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	392 "add %4, %1 \n\t"
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	393 PAVGB" (%2), %%mm0 \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	394 PAVGB" 8(%2), %%mm1 \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	395 "movq %%mm0, (%3) \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	396 "movq %%mm1, 8(%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	397 "add %5, %3 \n\t"
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	398 "movq (%1), %%mm0 \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	399 "movq 8(%1), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	400 "add %4, %1 \n\t"
/* second row of the pair comes from src2 + 16 */
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	401 PAVGB" 16(%2), %%mm0 \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	402 PAVGB" 24(%2), %%mm1 \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	403 "movq %%mm0, (%3) \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	404 "movq %%mm1, 8(%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	405 "add %5, %3 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	406 "add $32, %2 \n\t"
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	407 "subl $2, %0 \n\t"
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	408 "jnz 1b \n\t"
/* under PIC the row counter lives in memory instead of the "b" register */
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	409 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	410 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	411 #else
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	412 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	413 #endif
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	414 :"S"((long)src1Stride), "D"((long)dstStride)
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	415 :"memory");
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	416 //the following should be used, though better not with gcc ...
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	417 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	418 :"r"(src1Stride), "r"(dstStride)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	419 :"memory");*/
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	420 }
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	421
/*
 * avg_pixels16_l2: like the put variant, but the PAVGB average of the two
 * sources is averaged once more with the bytes already in dst before it is
 * stored: dst = PAVGB(PAVGB(src1, src2), dst), 16 bytes per row.
 *
 *   dst        - destination, read and written; advanced by dstStride per row
 *   src1       - first source; advanced by src1Stride per row
 *   src2       - second source; read as consecutive 16-byte rows (the code
 *                advances it by 16 bytes per row, 32 per loop iteration)
 *   dstStride  - byte stride of dst
 *   src1Stride - byte stride of src1
 *   h          - row count; an odd count is handled by doing one row up
 *                front, after which rows are processed in pairs
 *
 * NOTE: the two-row loop is entered without checking the counter and only
 * exits when it reaches exactly zero, so h is expected to be >= 2.
 *
 * DEF() and PAVGB are macros supplied by the including file.
 *
 * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *               %4 = src1Stride, %5 = dstStride.
 * Fix: the pointer declarators on dst/src1 had been lost; both are used as
 * base addresses below and must be pointers, like src2.
 */
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	422 static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	423 {
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	424 __asm __volatile(
/* odd row count: process a single row first so the remainder is even */
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	425 "testl $1, %0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	426 " jz 1f \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	427 "movq (%1), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	428 "movq 8(%1), %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	429 PAVGB" (%2), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	430 PAVGB" 8(%2), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	431 "add %4, %1 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	432 "add $16, %2 \n\t"
/* fold in the bytes currently stored at dst */
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	433 PAVGB" (%3), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	434 PAVGB" 8(%3), %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	435 "movq %%mm0, (%3) \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	436 "movq %%mm1, 8(%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	437 "add %5, %3 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	438 "decl %0 \n\t"
/* main loop: two rows per iteration */
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	439 "1: \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	440 "movq (%1), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	441 "movq 8(%1), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	442 "add %4, %1 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	443 PAVGB" (%2), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	444 PAVGB" 8(%2), %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	445 PAVGB" (%3), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	446 PAVGB" 8(%3), %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	447 "movq %%mm0, (%3) \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	448 "movq %%mm1, 8(%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	449 "add %5, %3 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	450 "movq (%1), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	451 "movq 8(%1), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	452 "add %4, %1 \n\t"
/* second row of the pair comes from src2 + 16 */
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	453 PAVGB" 16(%2), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	454 PAVGB" 24(%2), %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	455 PAVGB" (%3), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	456 PAVGB" 8(%3), %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	457 "movq %%mm0, (%3) \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	458 "movq %%mm1, 8(%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	459 "add %5, %3 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	460 "add $32, %2 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	461 "subl $2, %0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	462 "jnz 1b \n\t"
/* under PIC the row counter lives in memory instead of the "b" register */
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	463 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	464 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	465 #else
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	466 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	467 #endif
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	468 :"S"((long)src1Stride), "D"((long)dstStride)
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	469 :"memory");
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	470 //the following should be used, though better not with gcc ...
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	471 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	472 :"r"(src1Stride), "r"(dstStride)
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	473 :"memory");*/
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	474 }
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	475
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	476 static void DEF(put_no_rnd_pixels16_l2)(uint8_t dst, uint8_t src1, uint8_t *src2, int dstStride, int src1Stride, int h)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	477 {
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	478 __asm __volatile(
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	479 "pcmpeqb %%mm6, %%mm6\n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	480 "testl $1, %0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	481 " jz 1f \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	482 "movq (%1), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	483 "movq 8(%1), %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	484 "movq (%2), %%mm2 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	485 "movq 8(%2), %%mm3 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	486 "pxor %%mm6, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	487 "pxor %%mm6, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	488 "pxor %%mm6, %%mm2 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	489 "pxor %%mm6, %%mm3 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	490 PAVGB" %%mm2, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	491 PAVGB" %%mm3, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	492 "pxor %%mm6, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	493 "pxor %%mm6, %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	494 "add %4, %1 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	495 "add $16, %2 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	496 "movq %%mm0, (%3) \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	497 "movq %%mm1, 8(%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	498 "add %5, %3 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	499 "decl %0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	500 "1: \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	501 "movq (%1), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	502 "movq 8(%1), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	503 "add %4, %1 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	504 "movq (%2), %%mm2 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	505 "movq 8(%2), %%mm3 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	506 "pxor %%mm6, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	507 "pxor %%mm6, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	508 "pxor %%mm6, %%mm2 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	509 "pxor %%mm6, %%mm3 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	510 PAVGB" %%mm2, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	511 PAVGB" %%mm3, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	512 "pxor %%mm6, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	513 "pxor %%mm6, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	514 "movq %%mm0, (%3) \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	515 "movq %%mm1, 8(%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	516 "add %5, %3 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	517 "movq (%1), %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	518 "movq 8(%1), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	519 "add %4, %1 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	520 "movq 16(%2), %%mm2 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	521 "movq 24(%2), %%mm3 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	522 "pxor %%mm6, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	523 "pxor %%mm6, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	524 "pxor %%mm6, %%mm2 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	525 "pxor %%mm6, %%mm3 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	526 PAVGB" %%mm2, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	527 PAVGB" %%mm3, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	528 "pxor %%mm6, %%mm0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	529 "pxor %%mm6, %%mm1 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	530 "movq %%mm0, (%3) \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	531 "movq %%mm1, 8(%3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	532 "add %5, %3 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	533 "add $32, %2 \n\t"
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	534 "subl $2, %0 \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	535 "jnz 1b \n\t"
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	536 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	537 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	538 #else
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	539 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	540 #endif
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	541 :"S"((long)src1Stride), "D"((long)dstStride)
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	542 :"memory");
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	543 //the following should be used, though better not with gcc ...
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	544 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	545 :"r"(src1Stride), "r"(dstStride)
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2024 diff changeset	546 :"memory");*/
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 651 diff changeset	547 }
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	548
/*
 * put_no_rnd_pixels8_x2: horizontal half-sample "put" for an 8-byte-wide block.
 *
 * For every row, the 8 source bytes at offset 0 are combined with the 8 source
 * bytes at offset 1 using PAVGB, after mm6 has been subtracted from the
 * offset-0 bytes with unsigned saturation (psubusb). The result overwrites the
 * destination row. 9 source bytes are therefore read per row.
 *
 * block     - destination; used as the address operand %2 of the asm
 * pixels    - source; used as the address operand %1 of the asm
 * line_size - byte distance between consecutive rows (same value is used for
 *             source and destination)
 * h         - number of rows; the loop subtracts 4 per pass and only exits
 *             when the counter is exactly 0, so h must be a positive
 *             multiple of 4
 *
 * MOVQ_BONE, PAVGB, REG_a and DEF are defined outside this chunk.
 * Presumably MOVQ_BONE(mm6) fills mm6 with 0x01 in every byte and PAVGB is
 * the byte-wise unsigned average (rounding up) of the target instruction set
 * -- TODO confirm against the macro definitions.
 * NOTE(review): block/pixels are declared as plain uint8_t in this text but
 * are used as addresses; the '*' was probably lost when the page was
 * extracted -- confirm against the repository.
 */
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	549 /* GL: this function does incorrect rounding if overflow */
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	550 static void DEF(put_no_rnd_pixels8_x2)(uint8_t block, const uint8_t pixels, int line_size, int h)
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	551 {
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 443 diff changeset	552 MOVQ_BONE(mm6);
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	553 __asm __volatile(
/* REG_a = 2 * line_size: step used to advance either pointer by two rows */
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	554 "lea (%3, %3), %%"REG_a" \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	555 "1: \n\t"
/* rows 0 and 1: mm0/mm2 = bytes at offset 0, mm1/mm3 = bytes at offset 1 */
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	556 "movq (%1), %%mm0 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	557 "movq (%1, %3), %%mm2 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	558 "movq 1(%1), %%mm1 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	559 "movq 1(%1, %3), %%mm3 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	560 "add %%"REG_a", %1 \n\t"
/* subtract mm6 (saturating at 0) from the offset-0 bytes, then average with the offset-1 bytes */
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 443 diff changeset	561 "psubusb %%mm6, %%mm0 \n\t"
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 443 diff changeset	562 "psubusb %%mm6, %%mm2 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	563 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	564 PAVGB" %%mm3, %%mm2 \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	565 "movq %%mm0, (%2) \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	566 "movq %%mm2, (%2, %3) \n\t"
/* rows 2 and 3: same sequence; pixels was already advanced by two rows above */
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	567 "movq (%1), %%mm0 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	568 "movq 1(%1), %%mm1 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	569 "movq (%1, %3), %%mm2 \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	570 "movq 1(%1, %3), %%mm3 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	571 "add %%"REG_a", %2 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	572 "add %%"REG_a", %1 \n\t"
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 443 diff changeset	573 "psubusb %%mm6, %%mm0 \n\t"
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 443 diff changeset	574 "psubusb %%mm6, %%mm2 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	575 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	576 PAVGB" %%mm3, %%mm2 \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	577 "movq %%mm0, (%2) \n\t"
6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	578 "movq %%mm2, (%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	579 "add %%"REG_a", %2 \n\t"
/* four rows are produced per pass; the loop exits only when h reaches exactly 0 */
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	580 "subl $4, %0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	581 "jnz 1b \n\t"
/* operands: %0 = h, %1 = pixels, %2 = block (all read-write), %3 = line_size widened to long */
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	582 :"+g"(h), "+S"(pixels), "+D"(block)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	583 :"r" ((long)line_size)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	584 :"%"REG_a, "memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	585 }
986e461dc072 Initial revision glantau parents: diff changeset	586
/*
 * put_pixels8_y2: vertical half-sample "put" for an 8-byte-wide block.
 *
 * Each destination row r receives PAVGB(source row r, source row r+1), so
 * h + 1 source rows are read in total. The last row loaded in one pass is
 * kept in mm0 and reused as the first row of the next pass.
 *
 * block     - destination; used as the address operand %2 of the asm
 * pixels    - source; used as the address operand %1 of the asm
 * line_size - byte distance between consecutive rows (same value is used for
 *             source and destination)
 * h         - number of rows; the loop subtracts 4 per pass and only exits
 *             when the counter is exactly 0, so h must be a positive
 *             multiple of 4
 *
 * PAVGB, REG_a and DEF are defined outside this chunk; PAVGB is presumably
 * the byte-wise unsigned average (rounding up) of the target instruction set
 * -- TODO confirm.
 * NOTE(review): block/pixels are declared as plain uint8_t in this text but
 * are used as addresses; the '*' was probably lost when the page was
 * extracted -- confirm against the repository.
 */
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	587 static void DEF(put_pixels8_y2)(uint8_t block, const uint8_t pixels, int line_size, int h)
0 986e461dc072 Initial revision glantau parents: diff changeset	588 {
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	589 __asm __volatile(
/* REG_a = 2 * line_size */
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	590 "lea (%3, %3), %%"REG_a" \n\t"
/* mm0 = source row 0; block is moved back one row so that the (%2, %3) and
 * (%2, REG_a) stores below hit the first and second destination rows */
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	591 "movq (%1), %%mm0 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	592 "sub %3, %2 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	593 "1: \n\t"
/* mm1 = row 1, mm2 = row 2; outputs: avg(row0,row1) and avg(row1,row2) */
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	594 "movq (%1, %3), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	595 "movq (%1, %%"REG_a"), %%mm2 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	596 "add %%"REG_a", %1 \n\t"
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	597 PAVGB" %%mm1, %%mm0 \n\t"
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	598 PAVGB" %%mm2, %%mm1 \n\t"
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	599 "movq %%mm0, (%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	600 "movq %%mm1, (%2, %%"REG_a") \n\t"
/* mm1 = row 3, mm0 = row 4 (kept for the next pass); outputs: avg(row2,row3) and avg(row3,row4) */
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	601 "movq (%1, %3), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	602 "movq (%1, %%"REG_a"), %%mm0 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	603 "add %%"REG_a", %2 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	604 "add %%"REG_a", %1 \n\t"
413 1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	605 PAVGB" %%mm1, %%mm2 \n\t"
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	606 PAVGB" %%mm0, %%mm1 \n\t"
1548abb7bbed * fix for -fPIC compilation - compiles with 2.95.2 as well - any kabi parents: 402 diff changeset	607 "movq %%mm2, (%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	608 "movq %%mm1, (%2, %%"REG_a") \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	609 "add %%"REG_a", %2 \n\t"
/* four rows are produced per pass; the loop exits only when h reaches exactly 0 */
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	610 "subl $4, %0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	611 "jnz 1b \n\t"
/* operands: %0 = h, %1 = pixels, %2 = block (all read-write), %3 = line_size widened to long */
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	612 :"+g"(h), "+S"(pixels), "+D" (block)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	613 :"r" ((long)line_size)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	614 :"%"REG_a, "memory");
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	615 }
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	616
/*
 * put_no_rnd_pixels8_y2: vertical half-sample "put" for an 8-byte-wide block,
 * with mm6 subtracted from one operand of every average.
 *
 * Same row pipeline as the plain y2 variant: destination row r is
 * PAVGB(source row r, source row r+1), h + 1 source rows are read, and the
 * last loaded row is carried in mm0 into the next pass. The difference is
 * that the rows loaded into mm1 (rows 1 and 3 of each group of four) have mm6
 * subtracted with unsigned saturation (psubusb) before they are averaged;
 * each of those rows takes part in two consecutive averages, so every output
 * row has exactly one adjusted operand.
 *
 * block     - destination; used as the address operand %2 of the asm
 * pixels    - source; used as the address operand %1 of the asm
 * line_size - byte distance between consecutive rows (same value is used for
 *             source and destination)
 * h         - number of rows; the loop subtracts 4 per pass and only exits
 *             when the counter is exactly 0, so h must be a positive
 *             multiple of 4
 *
 * MOVQ_BONE, PAVGB, REG_a and DEF are defined outside this chunk.
 * Presumably MOVQ_BONE(mm6) fills mm6 with 0x01 in every byte and PAVGB is
 * the byte-wise unsigned average (rounding up) -- TODO confirm.
 * NOTE(review): block/pixels are declared as plain uint8_t in this text but
 * are used as addresses; the '*' was probably lost when the page was
 * extracted -- confirm against the repository.
 */
389 f874493a1970 tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests) glantau parents: 387 diff changeset	617 /* GL: this function does incorrect rounding if overflow */
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	618 static void DEF(put_no_rnd_pixels8_y2)(uint8_t block, const uint8_t pixels, int line_size, int h)
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	619 {
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 443 diff changeset	620 MOVQ_BONE(mm6);
0 986e461dc072 Initial revision glantau parents: diff changeset	621 __asm __volatile(
/* REG_a = 2 * line_size */
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	622 "lea (%3, %3), %%"REG_a" \n\t"
/* mm0 = source row 0; block is moved back one row so that the (%2, %3) and
 * (%2, REG_a) stores below hit the first and second destination rows */
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	623 "movq (%1), %%mm0 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	624 "sub %3, %2 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	625 "1: \n\t"
/* mm1 = row 1 (then reduced by mm6), mm2 = row 2; outputs: avg(row0,row1') and avg(row1',row2) */
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	626 "movq (%1, %3), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	627 "movq (%1, %%"REG_a"), %%mm2 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	628 "add %%"REG_a", %1 \n\t"
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 443 diff changeset	629 "psubusb %%mm6, %%mm1 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	630 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	631 PAVGB" %%mm2, %%mm1 \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	632 "movq %%mm0, (%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	633 "movq %%mm1, (%2, %%"REG_a") \n\t"
/* mm1 = row 3 (then reduced by mm6), mm0 = row 4 (kept, unmodified, for the next pass) */
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	634 "movq (%1, %3), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	635 "movq (%1, %%"REG_a"), %%mm0 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	636 "add %%"REG_a", %2 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	637 "add %%"REG_a", %1 \n\t"
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 443 diff changeset	638 "psubusb %%mm6, %%mm1 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	639 PAVGB" %%mm1, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	640 PAVGB" %%mm0, %%mm1 \n\t"
439 6ae275655a23 * more PIC friendly and faster code kabi parents: 429 diff changeset	641 "movq %%mm2, (%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	642 "movq %%mm1, (%2, %%"REG_a") \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	643 "add %%"REG_a", %2 \n\t"
/* four rows are produced per pass; the loop exits only when h reaches exactly 0 */
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	644 "subl $4, %0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	645 "jnz 1b \n\t"
/* operands: %0 = h, %1 = pixels, %2 = block (all read-write), %3 = line_size widened to long */
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	646 :"+g"(h), "+S"(pixels), "+D" (block)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	647 :"r" ((long)line_size)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	648 :"%"REG_a, "memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	649 }
986e461dc072 Initial revision glantau parents: diff changeset	650
/*
 * avg_pixels8: blends an 8-byte-wide source block into the destination.
 *
 * For every row, the 8 bytes already in the destination are loaded, combined
 * with the 8 source bytes of the same row using PAVGB, and written back to
 * the destination.
 *
 * block     - destination (read and written); address operand %2 of the asm
 * pixels    - source; address operand %1 of the asm
 * line_size - byte distance between consecutive rows (same value is used for
 *             source and destination)
 * h         - number of rows; the loop subtracts 4 per pass and only exits
 *             when the counter is exactly 0, so h must be a positive
 *             multiple of 4
 *
 * PAVGB, REG_a and DEF are defined outside this chunk; PAVGB is presumably
 * the byte-wise unsigned average (rounding up) of the target instruction set
 * -- TODO confirm.
 * NOTE(review): block/pixels are declared as plain uint8_t in this text but
 * are used as addresses; the '*' was probably lost when the page was
 * extracted -- confirm against the repository.
 */
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	651 static void DEF(avg_pixels8)(uint8_t block, const uint8_t pixels, int line_size, int h)
0 986e461dc072 Initial revision glantau parents: diff changeset	652 {
986e461dc072 Initial revision glantau parents: diff changeset	653 __asm __volatile(
/* REG_a = 2 * line_size: step used to advance either pointer by two rows */
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	654 "lea (%3, %3), %%"REG_a" \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	655 "1: \n\t"
/* rows 0 and 1: load destination, average with source, store back */
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	656 "movq (%2), %%mm0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	657 "movq (%2, %3), %%mm1 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	658 PAVGB" (%1), %%mm0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	659 PAVGB" (%1, %3), %%mm1 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	660 "movq %%mm0, (%2) \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	661 "movq %%mm1, (%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	662 "add %%"REG_a", %1 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	663 "add %%"REG_a", %2 \n\t"
/* rows 2 and 3: same sequence; the source pointer is advanced before the stores */
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	664 "movq (%2), %%mm0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	665 "movq (%2, %3), %%mm1 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	666 PAVGB" (%1), %%mm0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	667 PAVGB" (%1, %3), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	668 "add %%"REG_a", %1 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	669 "movq %%mm0, (%2) \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	670 "movq %%mm1, (%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	671 "add %%"REG_a", %2 \n\t"
/* four rows are produced per pass; the loop exits only when h reaches exactly 0 */
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	672 "subl $4, %0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	673 "jnz 1b \n\t"
/* operands: %0 = h, %1 = pixels, %2 = block (all read-write), %3 = line_size widened to long */
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	674 :"+g"(h), "+S"(pixels), "+D"(block)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	675 :"r" ((long)line_size)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	676 :"%"REG_a, "memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	677 }
986e461dc072 Initial revision glantau parents: diff changeset	678
/*
 * avg_pixels8_x2: horizontal half-sample for an 8-byte-wide block, blended
 * into the destination.
 *
 * For every row, the 8 source bytes at offset 0 are combined with the 8
 * source bytes at offset 1 using PAVGB, that result is combined with the 8
 * bytes already in the destination using PAVGB again, and the outcome is
 * written back to the destination. 9 source bytes are read per row.
 *
 * block     - destination (read and written); address operand %2 of the asm
 * pixels    - source; address operand %1 of the asm
 * line_size - byte distance between consecutive rows (same value is used for
 *             source and destination)
 * h         - number of rows; the loop subtracts 4 per pass and only exits
 *             when the counter is exactly 0, so h must be a positive
 *             multiple of 4
 *
 * PAVGB, REG_a and DEF are defined outside this chunk; PAVGB is presumably
 * the byte-wise unsigned average (rounding up) of the target instruction set
 * -- TODO confirm.
 * NOTE(review): block/pixels are declared as plain uint8_t in this text but
 * are used as addresses; the '*' was probably lost when the page was
 * extracted -- confirm against the repository.
 */
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	679 static void DEF(avg_pixels8_x2)(uint8_t block, const uint8_t pixels, int line_size, int h)
0 986e461dc072 Initial revision glantau parents: diff changeset	680 {
986e461dc072 Initial revision glantau parents: diff changeset	681 __asm __volatile(
/* REG_a = 2 * line_size: step used to advance either pointer by two rows */
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	682 "lea (%3, %3), %%"REG_a" \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	683 "1: \n\t"
/* rows 0 and 1: source at offset 0, averaged with source at offset 1, then with the destination */
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	684 "movq (%1), %%mm0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	685 "movq (%1, %3), %%mm2 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	686 PAVGB" 1(%1), %%mm0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	687 PAVGB" 1(%1, %3), %%mm2 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	688 PAVGB" (%2), %%mm0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	689 PAVGB" (%2, %3), %%mm2 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	690 "add %%"REG_a", %1 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	691 "movq %%mm0, (%2) \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	692 "movq %%mm2, (%2, %3) \n\t"
/* rows 2 and 3: same sequence; block is advanced before the destination is read and written */
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	693 "movq (%1), %%mm0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	694 "movq (%1, %3), %%mm2 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	695 PAVGB" 1(%1), %%mm0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	696 PAVGB" 1(%1, %3), %%mm2 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	697 "add %%"REG_a", %2 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	698 "add %%"REG_a", %1 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	699 PAVGB" (%2), %%mm0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	700 PAVGB" (%2, %3), %%mm2 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	701 "movq %%mm0, (%2) \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	702 "movq %%mm2, (%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	703 "add %%"REG_a", %2 \n\t"
/* four rows are produced per pass; the loop exits only when h reaches exactly 0 */
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	704 "subl $4, %0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	705 "jnz 1b \n\t"
/* operands: %0 = h, %1 = pixels, %2 = block (all read-write), %3 = line_size widened to long */
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	706 :"+g"(h), "+S"(pixels), "+D"(block)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	707 :"r" ((long)line_size)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	708 :"%"REG_a, "memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	709 }
986e461dc072 Initial revision glantau parents: diff changeset	710
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	711 static void DEF(avg_pixels8_y2)(uint8_t block, const uint8_t pixels, int line_size, int h)
0 986e461dc072 Initial revision glantau parents: diff changeset	712 {
986e461dc072 Initial revision glantau parents: diff changeset	713 __asm __volatile(
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	714 "lea (%3, %3), %%"REG_a" \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	715 "movq (%1), %%mm0 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	716 "sub %3, %2 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	717 "1: \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	718 "movq (%1, %3), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	719 "movq (%1, %%"REG_a"), %%mm2 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	720 "add %%"REG_a", %1 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	721 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	722 PAVGB" %%mm2, %%mm1 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	723 "movq (%2, %3), %%mm3 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	724 "movq (%2, %%"REG_a"), %%mm4 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	725 PAVGB" %%mm3, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	726 PAVGB" %%mm4, %%mm1 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	727 "movq %%mm0, (%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	728 "movq %%mm1, (%2, %%"REG_a") \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	729 "movq (%1, %3), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	730 "movq (%1, %%"REG_a"), %%mm0 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	731 PAVGB" %%mm1, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	732 PAVGB" %%mm0, %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	733 "add %%"REG_a", %2 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	734 "add %%"REG_a", %1 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	735 "movq (%2, %3), %%mm3 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	736 "movq (%2, %%"REG_a"), %%mm4 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	737 PAVGB" %%mm3, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	738 PAVGB" %%mm4, %%mm1 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	739 "movq %%mm2, (%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	740 "movq %%mm1, (%2, %%"REG_a") \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	741 "add %%"REG_a", %2 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	742 "subl $4, %0 \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	743 "jnz 1b \n\t"
c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	744 :"+g"(h), "+S"(pixels), "+D"(block)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	745 :"r" ((long)line_size)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	746 :"%"REG_a, "memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	747 }
986e461dc072 Initial revision glantau parents: diff changeset	748
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	749 // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	750 static void DEF(avg_pixels8_xy2)(uint8_t block, const uint8_t pixels, int line_size, int h)
0 986e461dc072 Initial revision glantau parents: diff changeset	751 {
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 443 diff changeset	752 MOVQ_BONE(mm6);
0 986e461dc072 Initial revision glantau parents: diff changeset	753 __asm __volatile(
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	754 "lea (%3, %3), %%"REG_a" \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	755 "movq (%1), %%mm0 \n\t"
442 006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	756 PAVGB" 1(%1), %%mm0 \n\t"
006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	757 ".balign 8 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	758 "1: \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	759 "movq (%1, %%"REG_a"), %%mm2 \n\t"
442 006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	760 "movq (%1, %3), %%mm1 \n\t"
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 443 diff changeset	761 "psubusb %%mm6, %%mm2 \n\t"
442 006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	762 PAVGB" 1(%1, %3), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	763 PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	764 "add %%"REG_a", %1 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	765 PAVGB" %%mm1, %%mm0 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	766 PAVGB" %%mm2, %%mm1 \n\t"
442 006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	767 PAVGB" (%2), %%mm0 \n\t"
006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	768 PAVGB" (%2, %3), %%mm1 \n\t"
006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	769 "movq %%mm0, (%2) \n\t"
006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	770 "movq %%mm1, (%2, %3) \n\t"
006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	771 "movq (%1, %3), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	772 "movq (%1, %%"REG_a"), %%mm0 \n\t"
442 006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	773 PAVGB" 1(%1, %3), %%mm1 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	774 PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	775 "add %%"REG_a", %2 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	776 "add %%"REG_a", %1 \n\t"
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	777 PAVGB" %%mm1, %%mm2 \n\t"
f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 151 diff changeset	778 PAVGB" %%mm0, %%mm1 \n\t"
442 006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	779 PAVGB" (%2), %%mm2 \n\t"
006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	780 PAVGB" (%2, %3), %%mm1 \n\t"
006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	781 "movq %%mm2, (%2) \n\t"
006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	782 "movq %%mm1, (%2, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	783 "add %%"REG_a", %2 \n\t"
441 c0de4d3c7d3c * optimized avg_* functions (except xy2) kabi parents: 439 diff changeset	784 "subl $4, %0 \n\t"
442 006965950f49 * optimized remaing avg_pixels_xy2 kabi parents: 441 diff changeset	785 "jnz 1b \n\t"
443 63467327c06c * cosmetic minor change kabi parents: 442 diff changeset	786 :"+g"(h), "+S"(pixels), "+D"(block)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	787 :"r" ((long)line_size)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2209 diff changeset	788 :"%"REG_a, "memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	789 }
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	790
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	791 //FIXME the following could be optimized too ...
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	792 static void DEF(put_no_rnd_pixels16_x2)(uint8_t block, const uint8_t pixels, int line_size, int h){
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	793 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	794 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	795 }
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	796 static void DEF(put_pixels16_y2)(uint8_t block, const uint8_t pixels, int line_size, int h){
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	797 DEF(put_pixels8_y2)(block , pixels , line_size, h);
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	798 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	799 }
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	800 static void DEF(put_no_rnd_pixels16_y2)(uint8_t block, const uint8_t pixels, int line_size, int h){
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	801 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	802 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	803 }
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	804 static void DEF(avg_pixels16)(uint8_t block, const uint8_t pixels, int line_size, int h){
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	805 DEF(avg_pixels8)(block , pixels , line_size, h);
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	806 DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	807 }
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	808 static void DEF(avg_pixels16_x2)(uint8_t block, const uint8_t pixels, int line_size, int h){
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	809 DEF(avg_pixels8_x2)(block , pixels , line_size, h);
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	810 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	811 }
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	812 static void DEF(avg_pixels16_y2)(uint8_t block, const uint8_t pixels, int line_size, int h){
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	813 DEF(avg_pixels8_y2)(block , pixels , line_size, h);
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	814 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	815 }
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 954 diff changeset	816 static void DEF(avg_pixels16_xy2)(uint8_t block, const uint8_t pixels, int line_size, int h){
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	817 DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	818 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	819 }
45e8f39fda50 put/avg_pixels16 michaelni parents: 448 diff changeset	820

Mercurial > libavcodec.hg

annotate i386/dsputil_mmx_avg.h @ 2662:2fe9599170f6 libavcodec