libavcodec.hg: i386/mpegvideo

annotate i386/mpegvideo_mmx.c @ 2422:18b8b2dcc037 libavcodec

various security fixes and precautionary checks

author	michael
date	Wed, 12 Jan 2005 00:16:25 +0000
parents	15cfba1b97b5
children	ef2149182f1c

rev	line source
8 1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	1 /*
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	2 * The simplest mpeg encoder (well, it was the simplest!)
429 718a22dc121f license/copyright change glantau parents: 350 diff changeset	3 * Copyright (c) 2000,2001 Fabrice Bellard.
8 1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	4 *
429 718a22dc121f license/copyright change glantau parents: 350 diff changeset	5 * This library is free software; you can redistribute it and/or
718a22dc121f license/copyright change glantau parents: 350 diff changeset	6 * modify it under the terms of the GNU Lesser General Public
718a22dc121f license/copyright change glantau parents: 350 diff changeset	7 * License as published by the Free Software Foundation; either
718a22dc121f license/copyright change glantau parents: 350 diff changeset	8 * version 2 of the License, or (at your option) any later version.
8 1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	9 *
429 718a22dc121f license/copyright change glantau parents: 350 diff changeset	10 * This library is distributed in the hope that it will be useful,
8 1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
429 718a22dc121f license/copyright change glantau parents: 350 diff changeset	12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
718a22dc121f license/copyright change glantau parents: 350 diff changeset	13 * Lesser General Public License for more details.
8 1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	14 *
429 718a22dc121f license/copyright change glantau parents: 350 diff changeset	15 * You should have received a copy of the GNU Lesser General Public
718a22dc121f license/copyright change glantau parents: 350 diff changeset	16 * License along with this library; if not, write to the Free Software
718a22dc121f license/copyright change glantau parents: 350 diff changeset	17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
8 1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	18 *
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	19 * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru>
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	20 * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
8 1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	21 */
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	22
14 8ceb13af9cb6 renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff glantau parents: 8 diff changeset	23 #include "../dsputil.h"
8ceb13af9cb6 renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff glantau parents: 8 diff changeset	24 #include "../mpegvideo.h"
220 0b234715e205 (commit by michael) arpi_esp parents: 206 diff changeset	25 #include "../avcodec.h"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	26 #include "mmx.h"
14 8ceb13af9cb6 renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff glantau parents: 8 diff changeset	27
/* External 64-entry lookup tables; they are only declared here. */
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 949 diff changeset	28 extern uint8_t zigzag_direct_noperm[64];
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 949 diff changeset	29 extern uint16_t inv_zigzag_direct16[64];
200 6ab301aaa652 (commit by michael) arpi_esp parents: 153 diff changeset	30
/*
 * 8-byte aligned 64-bit constants: mm_wabs has every bit set, mm_wone holds
 * the value 1 in each of its four 16-bit words.
 */
8 1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	31 static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL;
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	32 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	33
145 bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	34
/*
 * Dequantizes one intra block in place with MMX.
 *
 * s      - codec context; the fields read here are h263_aic, ac_pred,
 *          y_dc_scale, c_dc_scale, block_last_index[] and
 *          inter_scantable.raster_end[].
 * block  - coefficients, rewritten in place.
 * n      - block number; n < 4 selects y_dc_scale, otherwise c_dc_scale.
 * qscale - quantizer; qmul = 2*qscale, qadd = (qscale-1) OR 1, or 0 when
 *          h263_aic is set.
 *
 * Every non-zero coefficient c becomes c*qmul + qadd when c > 0 and
 * c*qmul - qadd when c < 0; zero coefficients stay zero. block[0] is then
 * overwritten with the DC value that was computed before the loop.
 *
 * The loop handles 16 bytes per iteration, always runs at least once and
 * stops once the byte offset becomes positive, so it can process up to
 * 14 bytes beyond block[nCoeffs].
 *
 * NOTE(review): the word-sized MMX operations and the 2*nCoeffs byte offset
 * suggest that DCTELEM is 16 bits wide - confirm against its definition.
 * NOTE(review): operand %3 is declared input-only but is modified by
 * "add $16, %3", and no MMX register is listed as clobbered; GCC's extended
 * asm rules say input-only operands must not be modified - worth confirming.
 * NOTE(review): the assert accepts block_last_index[n] < 0 when h263_aic is
 * set, yet raster_end[] is indexed with that value whenever ac_pred is 0 -
 * verify that callers never reach this combination.
 */
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	35 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
145 bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	36 DCTELEM *block, int n, int qscale)
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	37 {
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	38 long level, qmul, qadd, nCoeffs;
145 bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	39
706 e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	40 qmul = qscale << 1;
e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	41
1661 4c9fd29f1606 h263 slice structured mode michael parents: 1597 diff changeset	42 assert(s->block_last_index[n]>=0 \|\| s->h263_aic);
706 e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	43
/* DC coefficient: scaled by the luma/chroma DC scale unless h263_aic is set, in which case it is kept as is. */
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	44 if (!s->h263_aic) {
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	45 if (n < 4)
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	46 level = block[0] * s->y_dc_scale;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	47 else
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	48 level = block[0] * s->c_dc_scale;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	49 qadd = (qscale - 1) \| 1;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	50 }else{
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	51 qadd = 0;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	52 level= block[0];
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	53 }
/* With ac_pred the whole block (index 63) is processed; otherwise only up to the last coefficient position given by the scan table. */
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	54 if(s->ac_pred)
706 e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	55 nCoeffs=63;
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	56 else
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	57 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
200 6ab301aaa652 (commit by michael) arpi_esp parents: 153 diff changeset	58 //printf("%d %d ", qmul, qadd);
145 bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	59 asm volatile(
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	60 "movd %1, %%mm6 \n\t" // mm6 = qmul in the low dword
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	61 "packssdw %%mm6, %%mm6 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	62 "packssdw %%mm6, %%mm6 \n\t" // mm6 = qmul in all four words
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	63 "movd %2, %%mm5 \n\t" // mm5 = qadd in the low dword
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	64 "pxor %%mm7, %%mm7 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	65 "packssdw %%mm5, %%mm5 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	66 "packssdw %%mm5, %%mm5 \n\t" // mm5 = qadd in all four words
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	67 "psubw %%mm5, %%mm7 \n\t" // mm7 = -qadd in all four words
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	68 "pxor %%mm4, %%mm4 \n\t" // mm4 = 0
153 acbd3bc999b3 Let loops will be aligned nickols_k parents: 145 diff changeset	69 ".balign 16\n\t"
145 bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	70 "1: \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	71 "movq (%0, %3), %%mm0 \n\t" // load 16 bytes of coefficients
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	72 "movq 8(%0, %3), %%mm1 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	73
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	74 "pmullw %%mm6, %%mm0 \n\t" // block[i] * qmul (low 16 bits)
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	75 "pmullw %%mm6, %%mm1 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	76
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	77 "movq (%0, %3), %%mm2 \n\t" // reload the unscaled coefficients
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	78 "movq 8(%0, %3), %%mm3 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	79
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	80 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0 (AT&T order: mm2 = mm2 > mm4)
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	81 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	82
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	83 "pxor %%mm2, %%mm0 \n\t" // bitwise-invert the products of positive coefficients
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	84 "pxor %%mm3, %%mm1 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	85
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	86 "paddw %%mm7, %%mm0 \n\t" // subtract qadd
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	87 "paddw %%mm7, %%mm1 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	88
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	89 "pxor %%mm0, %%mm2 \n\t" // invert back: positives end up as product + qadd, others as product - qadd
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	90 "pxor %%mm1, %%mm3 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	91
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	92 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	93 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	94
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	95 "pandn %%mm2, %%mm0 \n\t" // force the result to 0 where block[i] was 0
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	96 "pandn %%mm3, %%mm1 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	97
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	98 "movq %%mm0, (%0, %3) \n\t" // store the dequantized coefficients
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	99 "movq %%mm1, 8(%0, %3) \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	100
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	101 "add $16, %3 \n\t" // advance the byte offset by 16
706 e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	102 "jng 1b \n\t" // repeat while the offset is still <= 0
e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	103 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) // %0 = base pointer, %3 = negative offset that counts up towards 0
145 bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	104 : "memory"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	105 );
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	106 block[0]= level; // the loop also rewrote block[0]; replace it with the DC value computed above
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	107 }
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	108
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	109
/*
 * Dequantizes one inter block in place with MMX.
 *
 * s      - codec context; the fields read here are h263_aic (assert only),
 *          block_last_index[] and inter_scantable.raster_end[].
 * block  - coefficients, rewritten in place.
 * n      - block number, used to index block_last_index[].
 * qscale - quantizer; qmul = 2*qscale, qadd = (qscale-1) OR 1.
 *
 * Every non-zero coefficient c becomes c*qmul + qadd when c > 0 and
 * c*qmul - qadd when c < 0; zero coefficients stay zero.
 *
 * The loop handles 16 bytes per iteration, always runs at least once and
 * stops once the byte offset becomes positive, so it can process up to
 * 14 bytes beyond block[nCoeffs].
 *
 * NOTE(review): the word-sized MMX operations and the 2*nCoeffs byte offset
 * suggest that DCTELEM is 16 bits wide - confirm against its definition.
 * NOTE(review): operand %3 is declared input-only but is modified by
 * "add $16, %3", and no MMX register is listed as clobbered; GCC's extended
 * asm rules say input-only operands must not be modified - worth confirming.
 * NOTE(review): the assert accepts block_last_index[n] < 0 when h263_aic is
 * set, yet raster_end[] is always indexed with that value - verify that
 * callers never pass such a block.
 */
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	110 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	111 DCTELEM *block, int n, int qscale)
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	112 {
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	113 long qmul, qadd, nCoeffs;
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	114
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	115 qmul = qscale << 1;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	116 qadd = (qscale - 1) \| 1;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	117
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	118 assert(s->block_last_index[n]>=0 \|\| s->h263_aic);
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	119
/* Only process up to the last coefficient position given by the scan table. */
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	120 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	121 //printf("%d %d ", qmul, qadd);
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	122 asm volatile(
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	123 "movd %1, %%mm6 \n\t" // mm6 = qmul in the low dword
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	124 "packssdw %%mm6, %%mm6 \n\t"
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	125 "packssdw %%mm6, %%mm6 \n\t" // mm6 = qmul in all four words
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	126 "movd %2, %%mm5 \n\t" // mm5 = qadd in the low dword
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	127 "pxor %%mm7, %%mm7 \n\t"
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	128 "packssdw %%mm5, %%mm5 \n\t"
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	129 "packssdw %%mm5, %%mm5 \n\t" // mm5 = qadd in all four words
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	130 "psubw %%mm5, %%mm7 \n\t" // mm7 = -qadd in all four words
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	131 "pxor %%mm4, %%mm4 \n\t" // mm4 = 0
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	132 ".balign 16\n\t"
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	133 "1: \n\t"
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	134 "movq (%0, %3), %%mm0 \n\t" // load 16 bytes of coefficients
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	135 "movq 8(%0, %3), %%mm1 \n\t"
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	136
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	137 "pmullw %%mm6, %%mm0 \n\t" // block[i] * qmul (low 16 bits)
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	138 "pmullw %%mm6, %%mm1 \n\t"
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	139
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	140 "movq (%0, %3), %%mm2 \n\t" // reload the unscaled coefficients
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	141 "movq 8(%0, %3), %%mm3 \n\t"
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	142
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	143 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0 (AT&T order: mm2 = mm2 > mm4)
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	144 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	145
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	146 "pxor %%mm2, %%mm0 \n\t" // bitwise-invert the products of positive coefficients
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	147 "pxor %%mm3, %%mm1 \n\t"
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	148
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	149 "paddw %%mm7, %%mm0 \n\t" // subtract qadd
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	150 "paddw %%mm7, %%mm1 \n\t"
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	151
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	152 "pxor %%mm0, %%mm2 \n\t" // invert back: positives end up as product + qadd, others as product - qadd
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	153 "pxor %%mm1, %%mm3 \n\t"
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	154
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	155 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	156 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	157
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	158 "pandn %%mm2, %%mm0 \n\t" // force the result to 0 where block[i] was 0
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	159 "pandn %%mm3, %%mm1 \n\t"
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	160
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	161 "movq %%mm0, (%0, %3) \n\t" // store the dequantized coefficients
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	162 "movq %%mm1, 8(%0, %3) \n\t"
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	163
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	164 "add $16, %3 \n\t" // advance the byte offset by 16
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	165 "jng 1b \n\t" // repeat while the offset is still <= 0
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	166 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) // %0 = base pointer, %3 = negative offset that counts up towards 0
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	167 : "memory"
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	168 );
145 bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	169 }
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	170
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	171
8 1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	172 /*
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	173 NK:
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	174 Note: looking at PARANOID:
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	175 "enable all paranoid tests for rounding, overflows, etc..."
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	176
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	177 #ifdef PARANOID
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	178 if (level < -2048 || level > 2047)
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	179 fprintf(stderr, "unquant error %d %d\n", i, level);
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	180 #endif
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	181 We can assume that the result of the two multiplications cannot be greater than 0xFFFF,
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	182 i.e. it fits in 16 bits, so we use only the PMULLW instruction here and can avoid
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	183 a complex multiplication.
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	184 =====================================================
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	185 Full formula for multiplication of 2 integer numbers
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	186 which are represented as high:low words:
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	187 input: value1 = high1:low1
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	188 value2 = high2:low2
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	189 output: value3 = value1*value2
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	190 value3=high3:low3 (on overflow: modulus 2^32 wrap-around)
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	191 this means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	192 but this algorithm will compute only 0x66cb0ce4;
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	193 this is limited by the 16-bit size of the operands
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	194 ---------------------------------
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	195 tlow1 = high1*low2
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	196 tlow2 = high2*low1
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	197 tlow1 = tlow1 + tlow2
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	198 high3:low3 = low1*low2
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	199 high3 += tlow1
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	200 */
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	201 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
14 8ceb13af9cb6 renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff glantau parents: 8 diff changeset	202 DCTELEM *block, int n, int qscale)
8 1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	203 {
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	204 long nCoeffs;
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 949 diff changeset	205 const uint16_t *quant_matrix;
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	206 int block0;
706 e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	207
e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	208 assert(s->block_last_index[n]>=0);
e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	209
e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	210 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
200 6ab301aaa652 (commit by michael) arpi_esp parents: 153 diff changeset	211
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	212 if (n < 4)
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	213 block0 = block[0] * s->y_dc_scale;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	214 else
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	215 block0 = block[0] * s->c_dc_scale;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	216 /* XXX: only mpeg1 */
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	217 quant_matrix = s->intra_matrix;
145 bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	218 asm volatile(
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	219 "pcmpeqw %%mm7, %%mm7 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	220 "psrlw $15, %%mm7 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	221 "movd %2, %%mm6 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	222 "packssdw %%mm6, %%mm6 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	223 "packssdw %%mm6, %%mm6 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	224 "mov %3, %%"REG_a" \n\t"
153 acbd3bc999b3 Let loops will be aligned nickols_k parents: 145 diff changeset	225 ".balign 16\n\t"
145 bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	226 "1: \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	227 "movq (%0, %%"REG_a"), %%mm0 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	228 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	229 "movq (%1, %%"REG_a"), %%mm4 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	230 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	231 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	232 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	233 "pxor %%mm2, %%mm2 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	234 "pxor %%mm3, %%mm3 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	235 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	236 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	237 "pxor %%mm2, %%mm0 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	238 "pxor %%mm3, %%mm1 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	239 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	240 "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	241 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	242 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	243 "pxor %%mm4, %%mm4 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	244 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	245 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	246 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	247 "psraw $3, %%mm0 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	248 "psraw $3, %%mm1 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	249 "psubw %%mm7, %%mm0 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	250 "psubw %%mm7, %%mm1 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	251 "por %%mm7, %%mm0 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	252 "por %%mm7, %%mm1 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	253 "pxor %%mm2, %%mm0 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	254 "pxor %%mm3, %%mm1 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	255 "psubw %%mm2, %%mm0 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	256 "psubw %%mm3, %%mm1 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	257 "pandn %%mm0, %%mm4 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	258 "pandn %%mm1, %%mm5 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	259 "movq %%mm4, (%0, %%"REG_a") \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	260 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	261
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	262 "add $16, %%"REG_a" \n\t"
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	263 "js 1b \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	264 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	265 : "%"REG_a, "memory"
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	266 );
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	267 block[0]= block0;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	268 }
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	269
/*
 * Dequantizes one inter-coded block in place using MMX inline assembly.
 *
 * s      - codec context; this function reads s->block_last_index[n],
 *          s->intra_scantable.raster_end[] and s->inter_matrix.
 * block  - coefficients, overwritten with the dequantized values.
 * n      - block index, used only to look up s->block_last_index[n].
 * qscale - quantizer scale multiplied into every quantization matrix entry.
 *
 * For every coefficient c the asm computes, in 16-bit lanes (pmullw keeps
 * only the low 16 bits of each product):
 *   v = ((2*abs(c) + 1) * (qscale * quant_matrix[i])) >> 4
 *   v = (v - 1) | 1
 * then gives v the sign of c. Lanes whose original c was 0 are stored as 0.
 * Eight coefficients are handled per loop iteration, starting at block[0],
 * so the processed range is nCoeffs rounded up to a multiple of eight.
 */
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	270 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	271 DCTELEM *block, int n, int qscale)
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	272 {
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	273 long nCoeffs;
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	274 const uint16_t *quant_matrix;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	275
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	276 assert(s->block_last_index[n]>=0);
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	277
/* number of coefficients to process: raster position of the last coded one, plus one */
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	278 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	279
344 9f6071a87e17 fixed msmpeg4 infinite loop if buggy stream michaelni parents: 325 diff changeset	280 quant_matrix = s->inter_matrix;
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	281 asm volatile(
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	282 "pcmpeqw %%mm7, %%mm7 \n\t" // mm7 = all bits set
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	283 "psrlw $15, %%mm7 \n\t" // mm7 = 1 in each 16-bit word
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	284 "movd %2, %%mm6 \n\t" // mm6 = qscale in the low dword
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	285 "packssdw %%mm6, %%mm6 \n\t" // two packs replicate qscale into
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	286 "packssdw %%mm6, %%mm6 \n\t" // all four words (signed saturation)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	287 "mov %3, %%"REG_a" \n\t" // byte offset -2*nCoeffs, counts up towards 0
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	288 ".balign 16\n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	289 "1: \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	290 "movq (%0, %%"REG_a"), %%mm0 \n\t" // four block coefficients
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	291 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" // next four block coefficients
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	292 "movq (%1, %%"REG_a"), %%mm4 \n\t" // matching four quant_matrix entries
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	293 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" // next four quant_matrix entries
145 bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	294 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	295 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	296 "pxor %%mm2, %%mm2 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	297 "pxor %%mm3, %%mm3 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	298 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	299 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	300 "pxor %%mm2, %%mm0 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	301 "pxor %%mm3, %%mm1 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	302 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	303 "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	304 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	305 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	306 "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	307 "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	308 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	309 "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	310 "pxor %%mm4, %%mm4 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	311 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	312 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	313 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
145 bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	314 "psraw $4, %%mm0 \n\t" // v = product >> 4 (arithmetic shift)
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	315 "psraw $4, %%mm1 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	316 "psubw %%mm7, %%mm0 \n\t" // v - 1
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	317 "psubw %%mm7, %%mm1 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	318 "por %%mm7, %%mm0 \n\t" // (v - 1) | 1
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	319 "por %%mm7, %%mm1 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	320 "pxor %%mm2, %%mm0 \n\t" // xor + subtract of the sign mask
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	321 "pxor %%mm3, %%mm1 \n\t" // negates lanes where block[i] < 0
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	322 "psubw %%mm2, %%mm0 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	323 "psubw %%mm3, %%mm1 \n\t"
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	324 "pandn %%mm0, %%mm4 \n\t" // result, forced to 0 where block[i] was 0
bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	325 "pandn %%mm1, %%mm5 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	326 "movq %%mm4, (%0, %%"REG_a") \n\t" // write the results back over the block
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	327 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	328
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	329 "add $16, %%"REG_a" \n\t" // advance 16 bytes = eight coefficients
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	330 "js 1b \n\t" // loop while the offset is still negative
/* %0 and %1 point nCoeffs entries past the start of each array; %2 is qscale; %3 is the initial negative byte offset */
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	331 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	332 : "%"REG_a, "memory"
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	333 );
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	334 }
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	335
/*
 * Dequantizes one intra-coded block in place using MMX inline assembly.
 *
 * s      - codec context; this function reads s->block_last_index[n],
 *          s->alternate_scan, s->intra_scantable.raster_end[],
 *          s->y_dc_scale, s->c_dc_scale and s->intra_matrix.
 * block  - coefficients, overwritten with the dequantized values.
 * n      - block index; n < 4 selects y_dc_scale for block[0], any other
 *          value selects c_dc_scale.
 * qscale - quantizer scale multiplied into every quantization matrix entry.
 *
 * block[0] is handled separately: it is scaled in C before the asm runs and
 * that value is written back afterwards, replacing whatever the asm loop
 * stored at index 0.
 * For every other coefficient c the asm computes, in 16-bit lanes (pmullw
 * keeps only the low 16 bits of each product):
 *   v = (abs(c) * (qscale * quant_matrix[i])) >> 3
 * then gives v the sign of c. Lanes whose original c was 0 are stored as 0.
 * Eight coefficients are handled per loop iteration, starting at block[0];
 * the loop repeats while the running byte offset is <= 0.
 */
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	336 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	337 DCTELEM *block, int n, int qscale)
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	338 {
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	339 long nCoeffs;
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 949 diff changeset	340 const uint16_t *quant_matrix;
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	341 int block0;
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	342
706 e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	343 assert(s->block_last_index[n]>=0);
e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	344
/* index of the last coefficient to process (no +1 here; the asm loop below uses jng instead of js) */
e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	345 if(s->alternate_scan) nCoeffs= 63; //FIXME
e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	346 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	347
/* scale block[0] in C now; the result is stored back after the asm loop */
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	348 if (n < 4)
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	349 block0 = block[0] * s->y_dc_scale;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	350 else
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	351 block0 = block[0] * s->c_dc_scale;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	352 quant_matrix = s->intra_matrix;
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	353 asm volatile(
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	354 "pcmpeqw %%mm7, %%mm7 \n\t" // mm7 = all bits set
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	355 "psrlw $15, %%mm7 \n\t" // mm7 = 1 in each word; not referenced again in this asm statement
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	356 "movd %2, %%mm6 \n\t" // mm6 = qscale in the low dword
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	357 "packssdw %%mm6, %%mm6 \n\t" // two packs replicate qscale into
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	358 "packssdw %%mm6, %%mm6 \n\t" // all four words (signed saturation)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	359 "mov %3, %%"REG_a" \n\t" // byte offset -2*nCoeffs, counts up
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	360 ".balign 16\n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	361 "1: \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	362 "movq (%0, %%"REG_a"), %%mm0 \n\t" // four block coefficients
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	363 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" // next four block coefficients
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	364 "movq (%1, %%"REG_a"), %%mm4 \n\t" // matching four quant_matrix entries
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	365 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" // next four quant_matrix entries
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	366 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	367 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	368 "pxor %%mm2, %%mm2 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	369 "pxor %%mm3, %%mm3 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	370 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	371 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	372 "pxor %%mm2, %%mm0 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	373 "pxor %%mm3, %%mm1 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	374 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	375 "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	376 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	377 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	378 "pxor %%mm4, %%mm4 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	379 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	380 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	381 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	382 "psraw $3, %%mm0 \n\t" // v = product >> 3 (arithmetic shift)
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	383 "psraw $3, %%mm1 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	384 "pxor %%mm2, %%mm0 \n\t" // xor + subtract of the sign mask
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	385 "pxor %%mm3, %%mm1 \n\t" // negates lanes where block[i] < 0
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	386 "psubw %%mm2, %%mm0 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	387 "psubw %%mm3, %%mm1 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	388 "pandn %%mm0, %%mm4 \n\t" // result, forced to 0 where block[i] was 0
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	389 "pandn %%mm1, %%mm5 \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	390 "movq %%mm4, (%0, %%"REG_a") \n\t" // write the results back over the block
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	391 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
145 bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	392
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	393 "add $16, %%"REG_a" \n\t" // advance 16 bytes = eight coefficients
706 e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	394 "jng 1b \n\t" // loop while the offset is <= 0
/* %0 and %1 point nCoeffs entries past the start of each array; %2 is qscale; %3 is the initial negative byte offset */
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	395 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	396 : "%"REG_a, "memory"
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	397 );
/* replace the loop's result at index 0 with the separately scaled value */
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	398 block[0]= block0;
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	399 //Note: we don't do mismatch control for intra, as errors cannot accumulate
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	400 }
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	401
/*
 * Dequantizes one inter-coded block in place using MMX inline assembly and
 * then conditionally flips the lowest bit of block[63].
 *
 * s      - codec context; this function reads s->block_last_index[n],
 *          s->alternate_scan, s->intra_scantable.raster_end[] and
 *          s->inter_matrix.
 * block  - coefficients, overwritten with the dequantized values.
 * n      - block index, used only to look up s->block_last_index[n].
 * qscale - quantizer scale multiplied into every quantization matrix entry.
 *
 * For every coefficient c the asm computes, in 16-bit lanes (pmullw keeps
 * only the low 16 bits of each product):
 *   v = ((2*abs(c) + 1) * (qscale * quant_matrix[i])) >> 4   (logical shift)
 * then gives v the sign of c. Lanes whose original c was 0 are stored as 0.
 * Eight coefficients are handled per loop iteration, starting at block[0];
 * the loop repeats while the running byte offset is <= 0.
 *
 * mm7 collects the XOR of every stored result. Its bit 0 starts at 1, so
 * after the loop the folded parity bit is set exactly when the XOR of the
 * low bits of all stored results is 0; in that case the low bit of block[63]
 * is toggled. NOTE(review): this looks like MPEG-2 mismatch control; confirm
 * against the C implementation.
 */
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	402 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	403 DCTELEM *block, int n, int qscale)
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	404 {
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	405 long nCoeffs;
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	406 const uint16_t *quant_matrix;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	407
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	408 assert(s->block_last_index[n]>=0);
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	409
/* index of the last coefficient to process (no +1 here; the asm loop below uses jng instead of js) */
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	410 if(s->alternate_scan) nCoeffs= 63; //FIXME
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	411 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	412
344 9f6071a87e17 fixed msmpeg4 infinite loop if buggy stream michaelni parents: 325 diff changeset	413 quant_matrix = s->inter_matrix;
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	414 asm volatile(
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	415 "pcmpeqw %%mm7, %%mm7 \n\t" // mm7 = all bits set
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	416 "psrlq $48, %%mm7 \n\t" // mm7 = 0x000000000000FFFF: XOR accumulator, bit 0 starts at 1
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	417 "movd %2, %%mm6 \n\t" // mm6 = qscale in the low dword
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	418 "packssdw %%mm6, %%mm6 \n\t" // two packs replicate qscale into
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	419 "packssdw %%mm6, %%mm6 \n\t" // all four words (signed saturation)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	420 "mov %3, %%"REG_a" \n\t" // byte offset -2*nCoeffs, counts up
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	421 ".balign 16\n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	422 "1: \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	423 "movq (%0, %%"REG_a"), %%mm0 \n\t" // four block coefficients
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	424 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" // next four block coefficients
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	425 "movq (%1, %%"REG_a"), %%mm4 \n\t" // matching four quant_matrix entries
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	426 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" // next four quant_matrix entries
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	427 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	428 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	429 "pxor %%mm2, %%mm2 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	430 "pxor %%mm3, %%mm3 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	431 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	432 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	433 "pxor %%mm2, %%mm0 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	434 "pxor %%mm3, %%mm1 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	435 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	436 "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	437 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	438 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	439 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	440 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	441 "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	442 "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	443 "pxor %%mm4, %%mm4 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	444 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	445 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	446 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	447 "psrlw $4, %%mm0 \n\t" // v = product >> 4 (logical shift)
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	448 "psrlw $4, %%mm1 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	449 "pxor %%mm2, %%mm0 \n\t" // xor + subtract of the sign mask
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	450 "pxor %%mm3, %%mm1 \n\t" // negates lanes where block[i] < 0
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	451 "psubw %%mm2, %%mm0 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	452 "psubw %%mm3, %%mm1 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	453 "pandn %%mm0, %%mm4 \n\t" // result, forced to 0 where block[i] was 0
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	454 "pandn %%mm1, %%mm5 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	455 "pxor %%mm4, %%mm7 \n\t" // fold both result quadwords into
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	456 "pxor %%mm5, %%mm7 \n\t" // the XOR accumulator
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	457 "movq %%mm4, (%0, %%"REG_a") \n\t" // write the results back over the block
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	458 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	459
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	460 "add $16, %%"REG_a" \n\t" // advance 16 bytes = eight coefficients
706 e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	461 "jng 1b \n\t" // loop while the offset is <= 0
325 15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	462 "movd 124(%0, %3), %%mm0 \n\t" // %0 + %3 is block[0]; +124 bytes loads block[62] and block[63]
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	463 "movq %%mm7, %%mm6 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	464 "psrlq $32, %%mm7 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	465 "pxor %%mm6, %%mm7 \n\t" // XOR the upper dword into the lower dword
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	466 "movq %%mm7, %%mm6 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	467 "psrlq $16, %%mm7 \n\t"
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	468 "pxor %%mm6, %%mm7 \n\t" // bit 0 is now the XOR of bit 0 of all four words
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	469 "pslld $31, %%mm7 \n\t" // keep only bit 0 of each dword, moved up to bit 31
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	470 "psrlq $15, %%mm7 \n\t" // low dword: bit 31 -> bit 16, the low bit of block[63]
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	471 "pxor %%mm7, %%mm0 \n\t" // toggle that bit when the parity bit is set
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	472 "movd %%mm0, 124(%0, %3) \n\t" // store block[62] and block[63] back
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	473
/* unlike the sibling dequantizers, %3 uses the "r" constraint because it also serves as an index register in 124(%0, %3) */
15efd80cf51e mpeg2/mpeg4 dequantizer support (c & mmx) michaelni parents: 312 diff changeset	474 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	475 : "%"REG_a, "memory"
145 bd1adece8280 dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at> arpi_esp parents: 14 diff changeset	476 );
8 1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	477 }
1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	478
206 994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	479 /* draw the edges of width 'w' of an image of size width, height
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	480 this mmx version can only handle w==8 || w==16 */
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 949 diff changeset	481 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
206 994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	482 {
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 949 diff changeset	483 uint8_t *ptr, *last_line;
206 994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	484 int i;
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	485
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	486 last_line = buf + (height - 1) * wrap;
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	487 /* left and right */
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	488 ptr = buf;
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	489 if(w==8)
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	490 {
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	491 asm volatile(
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	492 "1: \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	493 "movd (%0), %%mm0 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	494 "punpcklbw %%mm0, %%mm0 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	495 "punpcklwd %%mm0, %%mm0 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	496 "punpckldq %%mm0, %%mm0 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	497 "movq %%mm0, -8(%0) \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	498 "movq -8(%0, %2), %%mm1 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	499 "punpckhbw %%mm1, %%mm1 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	500 "punpckhwd %%mm1, %%mm1 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	501 "punpckhdq %%mm1, %%mm1 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	502 "movq %%mm1, (%0, %2) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	503 "add %1, %0 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	504 "cmp %3, %0 \n\t"
206 994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	505 " jb 1b \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	506 : "+r" (ptr)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	507 : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
206 994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	508 );
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	509 }
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	510 else
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	511 {
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	512 asm volatile(
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	513 "1: \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	514 "movd (%0), %%mm0 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	515 "punpcklbw %%mm0, %%mm0 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	516 "punpcklwd %%mm0, %%mm0 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	517 "punpckldq %%mm0, %%mm0 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	518 "movq %%mm0, -8(%0) \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	519 "movq %%mm0, -16(%0) \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	520 "movq -8(%0, %2), %%mm1 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	521 "punpckhbw %%mm1, %%mm1 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	522 "punpckhwd %%mm1, %%mm1 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	523 "punpckhdq %%mm1, %%mm1 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	524 "movq %%mm1, (%0, %2) \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	525 "movq %%mm1, 8(%0, %2) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	526 "add %1, %0 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	527 "cmp %3, %0 \n\t"
206 994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	528 " jb 1b \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	529 : "+r" (ptr)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	530 : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
206 994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	531 );
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	532 }
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	533
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	534 for(i=0;i<w;i+=4) {
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	535 /* top and bottom (and hopefully also the corners) */
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	536 ptr= buf - (i + 1) * wrap - w;
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	537 asm volatile(
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	538 "1: \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	539 "movq (%1, %0), %%mm0 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	540 "movq %%mm0, (%0) \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	541 "movq %%mm0, (%0, %2) \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	542 "movq %%mm0, (%0, %2, 2) \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	543 "movq %%mm0, (%0, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	544 "add $8, %0 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	545 "cmp %4, %0 \n\t"
206 994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	546 " jb 1b \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	547 : "+r" (ptr)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	548 : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w)
206 994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	549 );
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	550 ptr= last_line + (i + 1) * wrap - w;
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	551 asm volatile(
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	552 "1: \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	553 "movq (%1, %0), %%mm0 \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	554 "movq %%mm0, (%0) \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	555 "movq %%mm0, (%0, %2) \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	556 "movq %%mm0, (%0, %2, 2) \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	557 "movq %%mm0, (%0, %3) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	558 "add $8, %0 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	559 "cmp %4, %0 \n\t"
206 994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	560 " jb 1b \n\t"
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	561 : "+r" (ptr)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	562 : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w)
206 994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	563 );
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	564 }
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	565 }
994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	566
1719 4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	567 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	568 const int intra= s->mb_intra;
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	569 int *sum= s->dct_error_sum[intra];
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	570 uint16_t *offset= s->dct_offset[intra];
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	571
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	572 s->dct_count[intra]++;
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	573
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	574 asm volatile(
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	575 "pxor %%mm7, %%mm7 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	576 "1: \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	577 "pxor %%mm0, %%mm0 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	578 "pxor %%mm1, %%mm1 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	579 "movq (%0), %%mm2 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	580 "movq 8(%0), %%mm3 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	581 "pcmpgtw %%mm2, %%mm0 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	582 "pcmpgtw %%mm3, %%mm1 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	583 "pxor %%mm0, %%mm2 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	584 "pxor %%mm1, %%mm3 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	585 "psubw %%mm0, %%mm2 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	586 "psubw %%mm1, %%mm3 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	587 "movq %%mm2, %%mm4 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	588 "movq %%mm3, %%mm5 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	589 "psubusw (%2), %%mm2 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	590 "psubusw 8(%2), %%mm3 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	591 "pxor %%mm0, %%mm2 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	592 "pxor %%mm1, %%mm3 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	593 "psubw %%mm0, %%mm2 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	594 "psubw %%mm1, %%mm3 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	595 "movq %%mm2, (%0) \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	596 "movq %%mm3, 8(%0) \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	597 "movq %%mm4, %%mm2 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	598 "movq %%mm5, %%mm3 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	599 "punpcklwd %%mm7, %%mm4 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	600 "punpckhwd %%mm7, %%mm2 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	601 "punpcklwd %%mm7, %%mm5 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	602 "punpckhwd %%mm7, %%mm3 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	603 "paddd (%1), %%mm4 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	604 "paddd 8(%1), %%mm2 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	605 "paddd 16(%1), %%mm5 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	606 "paddd 24(%1), %%mm3 \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	607 "movq %%mm4, (%1) \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	608 "movq %%mm2, 8(%1) \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	609 "movq %%mm5, 16(%1) \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	610 "movq %%mm3, 24(%1) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	611 "add $16, %0 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	612 "add $32, %1 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	613 "add $16, %2 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	614 "cmp %3, %0 \n\t"
1719 4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	615 " jb 1b \n\t"
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	616 : "+r" (block), "+r" (sum), "+r" (offset)
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	617 : "r"(block+64)
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	618 );
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	619 }
4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	620
1720 96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	621 static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	622 const int intra= s->mb_intra;
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	623 int *sum= s->dct_error_sum[intra];
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	624 uint16_t *offset= s->dct_offset[intra];
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	625
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	626 s->dct_count[intra]++;
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	627
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	628 asm volatile(
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	629 "pxor %%xmm7, %%xmm7 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	630 "1: \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	631 "pxor %%xmm0, %%xmm0 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	632 "pxor %%xmm1, %%xmm1 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	633 "movdqa (%0), %%xmm2 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	634 "movdqa 16(%0), %%xmm3 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	635 "pcmpgtw %%xmm2, %%xmm0 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	636 "pcmpgtw %%xmm3, %%xmm1 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	637 "pxor %%xmm0, %%xmm2 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	638 "pxor %%xmm1, %%xmm3 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	639 "psubw %%xmm0, %%xmm2 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	640 "psubw %%xmm1, %%xmm3 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	641 "movdqa %%xmm2, %%xmm4 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	642 "movdqa %%xmm3, %%xmm5 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	643 "psubusw (%2), %%xmm2 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	644 "psubusw 16(%2), %%xmm3 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	645 "pxor %%xmm0, %%xmm2 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	646 "pxor %%xmm1, %%xmm3 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	647 "psubw %%xmm0, %%xmm2 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	648 "psubw %%xmm1, %%xmm3 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	649 "movdqa %%xmm2, (%0) \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	650 "movdqa %%xmm3, 16(%0) \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	651 "movdqa %%xmm4, %%xmm6 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	652 "movdqa %%xmm5, %%xmm0 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	653 "punpcklwd %%xmm7, %%xmm4 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	654 "punpckhwd %%xmm7, %%xmm6 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	655 "punpcklwd %%xmm7, %%xmm5 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	656 "punpckhwd %%xmm7, %%xmm0 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	657 "paddd (%1), %%xmm4 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	658 "paddd 16(%1), %%xmm6 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	659 "paddd 32(%1), %%xmm5 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	660 "paddd 48(%1), %%xmm0 \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	661 "movdqa %%xmm4, (%1) \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	662 "movdqa %%xmm6, 16(%1) \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	663 "movdqa %%xmm5, 32(%1) \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	664 "movdqa %%xmm0, 48(%1) \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	665 "add $32, %0 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	666 "add $64, %1 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	667 "add $32, %2 \n\t"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2024 diff changeset	668 "cmp %3, %0 \n\t"
1720 96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	669 " jb 1b \n\t"
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	670 : "+r" (block), "+r" (sum), "+r" (offset)
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	671 : "r"(block+64)
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	672 );
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	673 }
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	674
220 0b234715e205 (commit by michael) arpi_esp parents: 206 diff changeset	675 #undef HAVE_MMX2
0b234715e205 (commit by michael) arpi_esp parents: 206 diff changeset	676 #define RENAME(a) a ## _MMX
1565 1a9a63f59849 minor mmx2 optimization if the dct michael parents: 1261 diff changeset	677 #define RENAMEl(a) a ## _mmx
220 0b234715e205 (commit by michael) arpi_esp parents: 206 diff changeset	678 #include "mpegvideo_mmx_template.c"
0b234715e205 (commit by michael) arpi_esp parents: 206 diff changeset	679
0b234715e205 (commit by michael) arpi_esp parents: 206 diff changeset	680 #define HAVE_MMX2
0b234715e205 (commit by michael) arpi_esp parents: 206 diff changeset	681 #undef RENAME
1597 4c9165372ab3 noise reduction of dct coefficients michael parents: 1565 diff changeset	682 #undef RENAMEl
220 0b234715e205 (commit by michael) arpi_esp parents: 206 diff changeset	683 #define RENAME(a) a ## _MMX2
1565 1a9a63f59849 minor mmx2 optimization if the dct michael parents: 1261 diff changeset	684 #define RENAMEl(a) a ## _mmx2
220 0b234715e205 (commit by michael) arpi_esp parents: 206 diff changeset	685 #include "mpegvideo_mmx_template.c"
206 994aa8623443 (commit by michael) arpi_esp parents: 200 diff changeset	686
1765 e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1720 diff changeset	687 #undef RENAME
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1720 diff changeset	688 #undef RENAMEl
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1720 diff changeset	689 #define RENAME(a) a ## _SSE2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1720 diff changeset	690 #define RENAMEl(a) a ## _sse2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1720 diff changeset	691 #include "mpegvideo_mmx_template.c"
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1720 diff changeset	692
14 8ceb13af9cb6 renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff glantau parents: 8 diff changeset	693 void MPV_common_init_mmx(MpegEncContext *s)
8 1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	694 {
14 8ceb13af9cb6 renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff glantau parents: 8 diff changeset	695 if (mm_flags & MM_MMX) {
706 e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	696 const int dct_algo = s->avctx->dct_algo;
e65798d228ea idct permutation cleanup, idct can be selected per context now michaelni parents: 687 diff changeset	697
1689 1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	698 s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	699 s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	700 s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	701 s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	702 s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
1a2db2073848 split intra / inter dequantization michael parents: 1661 diff changeset	703 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
312 8cf5507e6ca5 mpeg4 mpeg quantizer support michaelni parents: 252 diff changeset	704
350 6ebbecc10063 - Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now. pulento parents: 344 diff changeset	705 draw_edges = draw_edges_mmx;
1719 4e72fb256b25 denoise_dct_mmx() michael parents: 1689 diff changeset	706
1720 96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	707 if (mm_flags & MM_SSE2) {
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	708 s->denoise_dct= denoise_dct_sse2;
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	709 } else {
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	710 s->denoise_dct= denoise_dct_mmx;
96a86bd1e0d5 denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1719 diff changeset	711 }
220 0b234715e205 (commit by michael) arpi_esp parents: 206 diff changeset	712
625 bb6a69f9d409 slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG) michaelni parents: 620 diff changeset	713 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
1765 e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1720 diff changeset	714 if(mm_flags & MM_SSE2){
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1720 diff changeset	715 s->dct_quantize= dct_quantize_SSE2;
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1720 diff changeset	716 } else if(mm_flags & MM_MMXEXT){
625 bb6a69f9d409 slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG) michaelni parents: 620 diff changeset	717 s->dct_quantize= dct_quantize_MMX2;
bb6a69f9d409 slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG) michaelni parents: 620 diff changeset	718 } else {
bb6a69f9d409 slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG) michaelni parents: 620 diff changeset	719 s->dct_quantize= dct_quantize_MMX;
bb6a69f9d409 slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG) michaelni parents: 620 diff changeset	720 }
350 6ebbecc10063 - Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now. pulento parents: 344 diff changeset	721 }
14 8ceb13af9cb6 renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff glantau parents: 8 diff changeset	722 }
8 1b4461b5a7fb Sync with mplayer's stuff nickols_k parents: diff changeset	723 }

Mercurial > libavcodec.hg

annotate i386/mpegvideo_mmx.c @ 2422:18b8b2dcc037 libavcodec