Mercurial > libavcodec.hg
view i386/mpegvideo_mmx.c @ 5876:731ee5ad6bde libavcodec
Correct assignment of interlaced_frame; was being set on output frames,
in display order, based on decoding information in decoding order. Now
set properly, immediately upon completion of decode.
Based on original patch from Reinhard Nissl, rnisssl % gmx , de
Original Thread: [FFmpeg-devel] H.264 + PAFF: BBC HD recording shows
extreme interlacing artefacts, Thu, 01 Nov 2007 22:43:09
author | heydowns |
---|---|
date | Mon, 05 Nov 2007 18:16:42 +0000 |
parents | 470601203f44 |
children | 55251379b5b1 |
line wrap: on
line source
/* * The simplest mpeg encoder (well, it was the simplest!) * Copyright (c) 2000,2001 Fabrice Bellard. * * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> * * This file is part of FFmpeg. * * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "dsputil.h" #include "mpegvideo.h" #include "avcodec.h" #include "x86_cpu.h" extern uint16_t inv_zigzag_direct16[64]; static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { long level, qmul, qadd, nCoeffs; qmul = qscale << 1; assert(s->block_last_index[n]>=0 || s->h263_aic); if (!s->h263_aic) { if (n < 4) level = block[0] * s->y_dc_scale; else level = block[0] * s->c_dc_scale; qadd = (qscale - 1) | 1; }else{ qadd = 0; level= block[0]; } if(s->ac_pred) nCoeffs=63; else nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; //printf("%d %d ", qmul, qadd); asm volatile( "movd %1, %%mm6 \n\t" //qmul "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "movd %2, %%mm5 \n\t" //qadd "pxor %%mm7, %%mm7 \n\t" "packssdw %%mm5, %%mm5 \n\t" "packssdw %%mm5, %%mm5 \n\t" "psubw %%mm5, %%mm7 \n\t" "pxor %%mm4, %%mm4 \n\t" ASMALIGN(4) "1: \n\t" "movq (%0, %3), %%mm0 \n\t" "movq 8(%0, %3), %%mm1 \n\t" "pmullw %%mm6, %%mm0 \n\t" "pmullw %%mm6, %%mm1 \n\t" "movq (%0, %3), %%mm2 \n\t" "movq 8(%0, %3), %%mm3 \n\t" "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 "pxor %%mm2, %%mm0 \n\t" "pxor %%mm3, %%mm1 \n\t" "paddw %%mm7, %%mm0 \n\t" "paddw %%mm7, %%mm1 \n\t" "pxor %%mm0, %%mm2 \n\t" "pxor %%mm1, %%mm3 \n\t" "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 "pandn %%mm2, %%mm0 \n\t" "pandn %%mm3, %%mm1 \n\t" "movq %%mm0, (%0, %3) \n\t" "movq %%mm1, 8(%0, %3) \n\t" "add $16, %3 \n\t" "jng 1b \n\t" ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) : "memory" ); block[0]= level; } static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { long qmul, qadd, nCoeffs; qmul = qscale << 1; qadd = (qscale - 1) | 1; assert(s->block_last_index[n]>=0 || s->h263_aic); nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; //printf("%d %d ", qmul, qadd); asm volatile( "movd %1, %%mm6 \n\t" //qmul "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "movd %2, %%mm5 \n\t" //qadd "pxor %%mm7, %%mm7 \n\t" "packssdw %%mm5, %%mm5 \n\t" "packssdw %%mm5, %%mm5 \n\t" "psubw %%mm5, %%mm7 \n\t" "pxor %%mm4, %%mm4 \n\t" ASMALIGN(4) "1: \n\t" "movq (%0, %3), %%mm0 \n\t" "movq 8(%0, %3), %%mm1 \n\t" "pmullw %%mm6, %%mm0 \n\t" "pmullw %%mm6, %%mm1 \n\t" "movq (%0, %3), %%mm2 \n\t" "movq 8(%0, %3), %%mm3 \n\t" "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 "pxor %%mm2, %%mm0 \n\t" "pxor %%mm3, %%mm1 \n\t" "paddw %%mm7, %%mm0 \n\t" "paddw %%mm7, %%mm1 \n\t" "pxor %%mm0, %%mm2 \n\t" "pxor %%mm1, %%mm3 \n\t" "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 "pandn %%mm2, %%mm0 \n\t" "pandn %%mm3, %%mm1 \n\t" "movq %%mm0, (%0, %3) \n\t" "movq %%mm1, 8(%0, %3) \n\t" "add $16, %3 \n\t" "jng 1b \n\t" ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) : "memory" ); } /* NK: Note: looking at PARANOID: "enable all paranoid tests for rounding, overflows, etc..." #ifdef PARANOID if (level < -2048 || level > 2047) fprintf(stderr, "unquant error %d %d\n", i, level); #endif We can suppose that result of two multiplications can't be greate of 0xFFFF i.e. is 16-bit, so we use here only PMULLW instruction and can avoid a complex multiplication. ===================================================== Full formula for multiplication of 2 integer numbers which are represent as high:low words: input: value1 = high1:low1 value2 = high2:low2 output: value3 = value1*value2 value3=high3:low3 (on overflow: modulus 2^32 wrap-around) this mean that for 0x123456 * 0x123456 correct result is 0x766cb0ce4 but this algorithm will compute only 0x66cb0ce4 this limited by 16-bit size of operands --------------------------------- tlow1 = high1*low2 tlow2 = high2*low1 tlow1 = tlow1 + tlow2 high3:low3 = low1*low2 high3 += tlow1 */ static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { long nCoeffs; const uint16_t *quant_matrix; int block0; assert(s->block_last_index[n]>=0); nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; if (n < 4) block0 = block[0] * s->y_dc_scale; else block0 = block[0] * s->c_dc_scale; /* XXX: only mpeg1 */ quant_matrix = s->intra_matrix; asm volatile( "pcmpeqw %%mm7, %%mm7 \n\t" "psrlw $15, %%mm7 \n\t" "movd %2, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "mov %3, %%"REG_a" \n\t" ASMALIGN(4) "1: \n\t" "movq (%0, %%"REG_a"), %%mm0 \n\t" "movq 8(%0, %%"REG_a"), %%mm1 \n\t" "movq (%1, %%"REG_a"), %%mm4 \n\t" "movq 8(%1, %%"REG_a"), %%mm5 \n\t" "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] "pxor %%mm2, %%mm2 \n\t" "pxor %%mm3, %%mm3 \n\t" "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 "pxor %%mm2, %%mm0 \n\t" "pxor %%mm3, %%mm1 \n\t" "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q "pxor %%mm4, %%mm4 \n\t" "pxor %%mm5, %%mm5 \n\t" // FIXME slow "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 "psraw $3, %%mm0 \n\t" "psraw $3, %%mm1 \n\t" "psubw %%mm7, %%mm0 \n\t" "psubw %%mm7, %%mm1 \n\t" "por %%mm7, %%mm0 \n\t" "por %%mm7, %%mm1 \n\t" "pxor %%mm2, %%mm0 \n\t" "pxor %%mm3, %%mm1 \n\t" "psubw %%mm2, %%mm0 \n\t" "psubw %%mm3, %%mm1 \n\t" "pandn %%mm0, %%mm4 \n\t" "pandn %%mm1, %%mm5 \n\t" "movq %%mm4, (%0, %%"REG_a") \n\t" "movq %%mm5, 8(%0, %%"REG_a") \n\t" "add $16, %%"REG_a" \n\t" "js 1b \n\t" ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) : "%"REG_a, "memory" ); block[0]= block0; } static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { long nCoeffs; const uint16_t *quant_matrix; assert(s->block_last_index[n]>=0); nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; quant_matrix = s->inter_matrix; asm volatile( "pcmpeqw %%mm7, %%mm7 \n\t" "psrlw $15, %%mm7 \n\t" "movd %2, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "mov %3, %%"REG_a" \n\t" ASMALIGN(4) "1: \n\t" "movq (%0, %%"REG_a"), %%mm0 \n\t" "movq 8(%0, %%"REG_a"), %%mm1 \n\t" "movq (%1, %%"REG_a"), %%mm4 \n\t" "movq 8(%1, %%"REG_a"), %%mm5 \n\t" "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] "pxor %%mm2, %%mm2 \n\t" "pxor %%mm3, %%mm3 \n\t" "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 "pxor %%mm2, %%mm0 \n\t" "pxor %%mm3, %%mm1 \n\t" "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q "pxor %%mm4, %%mm4 \n\t" "pxor %%mm5, %%mm5 \n\t" // FIXME slow "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 "psraw $4, %%mm0 \n\t" "psraw $4, %%mm1 \n\t" "psubw %%mm7, %%mm0 \n\t" "psubw %%mm7, %%mm1 \n\t" "por %%mm7, %%mm0 \n\t" "por %%mm7, %%mm1 \n\t" "pxor %%mm2, %%mm0 \n\t" "pxor %%mm3, %%mm1 \n\t" "psubw %%mm2, %%mm0 \n\t" "psubw %%mm3, %%mm1 \n\t" "pandn %%mm0, %%mm4 \n\t" "pandn %%mm1, %%mm5 \n\t" "movq %%mm4, (%0, %%"REG_a") \n\t" "movq %%mm5, 8(%0, %%"REG_a") \n\t" "add $16, %%"REG_a" \n\t" "js 1b \n\t" ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) : "%"REG_a, "memory" ); } static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { long nCoeffs; const uint16_t *quant_matrix; int block0; assert(s->block_last_index[n]>=0); if(s->alternate_scan) nCoeffs= 63; //FIXME else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; if (n < 4) block0 = block[0] * s->y_dc_scale; else block0 = block[0] * s->c_dc_scale; quant_matrix = s->intra_matrix; asm volatile( "pcmpeqw %%mm7, %%mm7 \n\t" "psrlw $15, %%mm7 \n\t" "movd %2, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "mov %3, %%"REG_a" \n\t" ASMALIGN(4) "1: \n\t" "movq (%0, %%"REG_a"), %%mm0 \n\t" "movq 8(%0, %%"REG_a"), %%mm1 \n\t" "movq (%1, %%"REG_a"), %%mm4 \n\t" "movq 8(%1, %%"REG_a"), %%mm5 \n\t" "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] "pxor %%mm2, %%mm2 \n\t" "pxor %%mm3, %%mm3 \n\t" "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 "pxor %%mm2, %%mm0 \n\t" "pxor %%mm3, %%mm1 \n\t" "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q "pxor %%mm4, %%mm4 \n\t" "pxor %%mm5, %%mm5 \n\t" // FIXME slow "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 "psraw $3, %%mm0 \n\t" "psraw $3, %%mm1 \n\t" "pxor %%mm2, %%mm0 \n\t" "pxor %%mm3, %%mm1 \n\t" "psubw %%mm2, %%mm0 \n\t" "psubw %%mm3, %%mm1 \n\t" "pandn %%mm0, %%mm4 \n\t" "pandn %%mm1, %%mm5 \n\t" "movq %%mm4, (%0, %%"REG_a") \n\t" "movq %%mm5, 8(%0, %%"REG_a") \n\t" "add $16, %%"REG_a" \n\t" "jng 1b \n\t" ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) : "%"REG_a, "memory" ); block[0]= block0; //Note, we do not do mismatch control for intra as errors cannot accumulate } static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { long nCoeffs; const uint16_t *quant_matrix; assert(s->block_last_index[n]>=0); if(s->alternate_scan) nCoeffs= 63; //FIXME else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; quant_matrix = s->inter_matrix; asm volatile( "pcmpeqw %%mm7, %%mm7 \n\t" "psrlq $48, %%mm7 \n\t" "movd %2, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "mov %3, %%"REG_a" \n\t" ASMALIGN(4) "1: \n\t" "movq (%0, %%"REG_a"), %%mm0 \n\t" "movq 8(%0, %%"REG_a"), %%mm1 \n\t" "movq (%1, %%"REG_a"), %%mm4 \n\t" "movq 8(%1, %%"REG_a"), %%mm5 \n\t" "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] "pxor %%mm2, %%mm2 \n\t" "pxor %%mm3, %%mm3 \n\t" "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 "pxor %%mm2, %%mm0 \n\t" "pxor %%mm3, %%mm1 \n\t" "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q "pxor %%mm4, %%mm4 \n\t" "pxor %%mm5, %%mm5 \n\t" // FIXME slow "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 "psrlw $4, %%mm0 \n\t" "psrlw $4, %%mm1 \n\t" "pxor %%mm2, %%mm0 \n\t" "pxor %%mm3, %%mm1 \n\t" "psubw %%mm2, %%mm0 \n\t" "psubw %%mm3, %%mm1 \n\t" "pandn %%mm0, %%mm4 \n\t" "pandn %%mm1, %%mm5 \n\t" "pxor %%mm4, %%mm7 \n\t" "pxor %%mm5, %%mm7 \n\t" "movq %%mm4, (%0, %%"REG_a") \n\t" "movq %%mm5, 8(%0, %%"REG_a") \n\t" "add $16, %%"REG_a" \n\t" "jng 1b \n\t" "movd 124(%0, %3), %%mm0 \n\t" "movq %%mm7, %%mm6 \n\t" "psrlq $32, %%mm7 \n\t" "pxor %%mm6, %%mm7 \n\t" "movq %%mm7, %%mm6 \n\t" "psrlq $16, %%mm7 \n\t" "pxor %%mm6, %%mm7 \n\t" "pslld $31, %%mm7 \n\t" "psrlq $15, %%mm7 \n\t" "pxor %%mm7, %%mm0 \n\t" "movd %%mm0, 124(%0, %3) \n\t" ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) : "%"REG_a, "memory" ); } /* draw the edges of width 'w' of an image of size width, height this mmx version can only handle w==8 || w==16 */ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) { uint8_t *ptr, *last_line; int i; last_line = buf + (height - 1) * wrap; /* left and right */ ptr = buf; if(w==8) { asm volatile( "1: \n\t" "movd (%0), %%mm0 \n\t" "punpcklbw %%mm0, %%mm0 \n\t" "punpcklwd %%mm0, %%mm0 \n\t" "punpckldq %%mm0, %%mm0 \n\t" "movq %%mm0, -8(%0) \n\t" "movq -8(%0, %2), %%mm1 \n\t" "punpckhbw %%mm1, %%mm1 \n\t" "punpckhwd %%mm1, %%mm1 \n\t" "punpckhdq %%mm1, %%mm1 \n\t" "movq %%mm1, (%0, %2) \n\t" "add %1, %0 \n\t" "cmp %3, %0 \n\t" " jb 1b \n\t" : "+r" (ptr) : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height) ); } else { asm volatile( "1: \n\t" "movd (%0), %%mm0 \n\t" "punpcklbw %%mm0, %%mm0 \n\t" "punpcklwd %%mm0, %%mm0 \n\t" "punpckldq %%mm0, %%mm0 \n\t" "movq %%mm0, -8(%0) \n\t" "movq %%mm0, -16(%0) \n\t" "movq -8(%0, %2), %%mm1 \n\t" "punpckhbw %%mm1, %%mm1 \n\t" "punpckhwd %%mm1, %%mm1 \n\t" "punpckhdq %%mm1, %%mm1 \n\t" "movq %%mm1, (%0, %2) \n\t" "movq %%mm1, 8(%0, %2) \n\t" "add %1, %0 \n\t" "cmp %3, %0 \n\t" " jb 1b \n\t" : "+r" (ptr) : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height) ); } for(i=0;i<w;i+=4) { /* top and bottom (and hopefully also the corners) */ ptr= buf - (i + 1) * wrap - w; asm volatile( "1: \n\t" "movq (%1, %0), %%mm0 \n\t" "movq %%mm0, (%0) \n\t" "movq %%mm0, (%0, %2) \n\t" "movq %%mm0, (%0, %2, 2) \n\t" "movq %%mm0, (%0, %3) \n\t" "add $8, %0 \n\t" "cmp %4, %0 \n\t" " jb 1b \n\t" : "+r" (ptr) : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w) ); ptr= last_line + (i + 1) * wrap - w; asm volatile( "1: \n\t" "movq (%1, %0), %%mm0 \n\t" "movq %%mm0, (%0) \n\t" "movq %%mm0, (%0, %2) \n\t" "movq %%mm0, (%0, %2, 2) \n\t" "movq %%mm0, (%0, %3) \n\t" "add $8, %0 \n\t" "cmp %4, %0 \n\t" " jb 1b \n\t" : "+r" (ptr) : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w) ); } } static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ const int intra= s->mb_intra; int *sum= s->dct_error_sum[intra]; uint16_t *offset= s->dct_offset[intra]; s->dct_count[intra]++; asm volatile( "pxor %%mm7, %%mm7 \n\t" "1: \n\t" "pxor %%mm0, %%mm0 \n\t" "pxor %%mm1, %%mm1 \n\t" "movq (%0), %%mm2 \n\t" "movq 8(%0), %%mm3 \n\t" "pcmpgtw %%mm2, %%mm0 \n\t" "pcmpgtw %%mm3, %%mm1 \n\t" "pxor %%mm0, %%mm2 \n\t" "pxor %%mm1, %%mm3 \n\t" "psubw %%mm0, %%mm2 \n\t" "psubw %%mm1, %%mm3 \n\t" "movq %%mm2, %%mm4 \n\t" "movq %%mm3, %%mm5 \n\t" "psubusw (%2), %%mm2 \n\t" "psubusw 8(%2), %%mm3 \n\t" "pxor %%mm0, %%mm2 \n\t" "pxor %%mm1, %%mm3 \n\t" "psubw %%mm0, %%mm2 \n\t" "psubw %%mm1, %%mm3 \n\t" "movq %%mm2, (%0) \n\t" "movq %%mm3, 8(%0) \n\t" "movq %%mm4, %%mm2 \n\t" "movq %%mm5, %%mm3 \n\t" "punpcklwd %%mm7, %%mm4 \n\t" "punpckhwd %%mm7, %%mm2 \n\t" "punpcklwd %%mm7, %%mm5 \n\t" "punpckhwd %%mm7, %%mm3 \n\t" "paddd (%1), %%mm4 \n\t" "paddd 8(%1), %%mm2 \n\t" "paddd 16(%1), %%mm5 \n\t" "paddd 24(%1), %%mm3 \n\t" "movq %%mm4, (%1) \n\t" "movq %%mm2, 8(%1) \n\t" "movq %%mm5, 16(%1) \n\t" "movq %%mm3, 24(%1) \n\t" "add $16, %0 \n\t" "add $32, %1 \n\t" "add $16, %2 \n\t" "cmp %3, %0 \n\t" " jb 1b \n\t" : "+r" (block), "+r" (sum), "+r" (offset) : "r"(block+64) ); } static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ const int intra= s->mb_intra; int *sum= s->dct_error_sum[intra]; uint16_t *offset= s->dct_offset[intra]; s->dct_count[intra]++; asm volatile( "pxor %%xmm7, %%xmm7 \n\t" "1: \n\t" "pxor %%xmm0, %%xmm0 \n\t" "pxor %%xmm1, %%xmm1 \n\t" "movdqa (%0), %%xmm2 \n\t" "movdqa 16(%0), %%xmm3 \n\t" "pcmpgtw %%xmm2, %%xmm0 \n\t" "pcmpgtw %%xmm3, %%xmm1 \n\t" "pxor %%xmm0, %%xmm2 \n\t" "pxor %%xmm1, %%xmm3 \n\t" "psubw %%xmm0, %%xmm2 \n\t" "psubw %%xmm1, %%xmm3 \n\t" "movdqa %%xmm2, %%xmm4 \n\t" "movdqa %%xmm3, %%xmm5 \n\t" "psubusw (%2), %%xmm2 \n\t" "psubusw 16(%2), %%xmm3 \n\t" "pxor %%xmm0, %%xmm2 \n\t" "pxor %%xmm1, %%xmm3 \n\t" "psubw %%xmm0, %%xmm2 \n\t" "psubw %%xmm1, %%xmm3 \n\t" "movdqa %%xmm2, (%0) \n\t" "movdqa %%xmm3, 16(%0) \n\t" "movdqa %%xmm4, %%xmm6 \n\t" "movdqa %%xmm5, %%xmm0 \n\t" "punpcklwd %%xmm7, %%xmm4 \n\t" "punpckhwd %%xmm7, %%xmm6 \n\t" "punpcklwd %%xmm7, %%xmm5 \n\t" "punpckhwd %%xmm7, %%xmm0 \n\t" "paddd (%1), %%xmm4 \n\t" "paddd 16(%1), %%xmm6 \n\t" "paddd 32(%1), %%xmm5 \n\t" "paddd 48(%1), %%xmm0 \n\t" "movdqa %%xmm4, (%1) \n\t" "movdqa %%xmm6, 16(%1) \n\t" "movdqa %%xmm5, 32(%1) \n\t" "movdqa %%xmm0, 48(%1) \n\t" "add $32, %0 \n\t" "add $64, %1 \n\t" "add $32, %2 \n\t" "cmp %3, %0 \n\t" " jb 1b \n\t" : "+r" (block), "+r" (sum), "+r" (offset) : "r"(block+64) ); } #ifdef HAVE_SSSE3 #define HAVE_SSSE3_BAK #endif #undef HAVE_SSSE3 #undef HAVE_SSE2 #undef HAVE_MMX2 #define RENAME(a) a ## _MMX #define RENAMEl(a) a ## _mmx #include "mpegvideo_mmx_template.c" #define HAVE_MMX2 #undef RENAME #undef RENAMEl #define RENAME(a) a ## _MMX2 #define RENAMEl(a) a ## _mmx2 #include "mpegvideo_mmx_template.c" #define HAVE_SSE2 #undef RENAME #undef RENAMEl #define RENAME(a) a ## _SSE2 #define RENAMEl(a) a ## _sse2 #include "mpegvideo_mmx_template.c" #ifdef HAVE_SSSE3_BAK #define HAVE_SSSE3 #undef RENAME #undef RENAMEl #define RENAME(a) a ## _SSSE3 #define RENAMEl(a) a ## _sse2 #include "mpegvideo_mmx_template.c" #endif void MPV_common_init_mmx(MpegEncContext *s) { if (mm_flags & MM_MMX) { const int dct_algo = s->avctx->dct_algo; s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; if(!(s->flags & CODEC_FLAG_BITEXACT)) s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; draw_edges = draw_edges_mmx; if (mm_flags & MM_SSE2) { s->denoise_dct= denoise_dct_sse2; } else { s->denoise_dct= denoise_dct_mmx; } if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ #ifdef HAVE_SSSE3 if(mm_flags & MM_SSSE3){ s->dct_quantize= dct_quantize_SSSE3; } else #endif if(mm_flags & MM_SSE2){ s->dct_quantize= dct_quantize_SSE2; } else if(mm_flags & MM_MMXEXT){ s->dct_quantize= dct_quantize_MMX2; } else { s->dct_quantize= dct_quantize_MMX; } } } }