view x86/h264_idct_sse2.asm @ 10952:ea8f891d997d libavcodec

H264 DXVA2 implementation It allows VLD H264 decoding using DXVA2 (GPU assisted decoding API under VISTA and Windows 7). It is implemented by using AVHWAccel API. It has been tested successfully for some time in VLC using an nvidia card on Windows 7. To compile it, you need to have the system header dxva2api.h (either from microsoft or using http://downloads.videolan.org/pub/videolan/testing/contrib/dxva2api.h) The generated libavcodec.dll does not depend directly on any new lib as the necessary objects are given by the application using FFmpeg.
author fenrir
date Wed, 20 Jan 2010 18:54:51 +0000
parents c08ca946c80a
children 980030a3e315
line wrap: on
line source

;*****************************************************************************
;* SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
pw_32: times 8 dw 32

SECTION .text

INIT_XMM
cglobal x264_add8x4_idct_sse2, 3,3,8
    movq   m0, [r1+ 0]
    movq   m1, [r1+ 8]
    movq   m2, [r1+16]
    movq   m3, [r1+24]
    movhps m0, [r1+32]
    movhps m1, [r1+40]
    movhps m2, [r1+48]
    movhps m3, [r1+56]
    IDCT4_1D 0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32 GLOBAL]
    IDCT4_1D 0,1,2,3,4,5
    pxor  m7, m7
    STORE_DIFF  m0, m4, m7, [r0]
    STORE_DIFF  m1, m4, m7, [r0+r2]
    lea   r0, [r0+r2*2]
    STORE_DIFF  m2, m4, m7, [r0]
    STORE_DIFF  m3, m4, m7, [r0+r2]
    RET