libavcodec.hg: x86/vp8dsp-init.c annotate

annotate x86/vp8dsp-init.c @ 12340:2d15f62f4f8a libavcodec

VP8: move zeroing of luma DC block into the WHT. Lets us do the zeroing in asm instead of C. Also makes it consistent with the way the regular iDCT code does it.

author	darkshikari
date	Mon, 02 Aug 2010 20:18:09 +0000
parents	435319d67bd8
children	3fc4c625b6f3

rev	line source
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	1 /*
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	2 * VP8 DSP functions x86-optimized
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	3 * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	4 * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	5 *
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	6 * This file is part of FFmpeg.
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	7 *
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	8 * FFmpeg is free software; you can redistribute it and/or
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	9 * modify it under the terms of the GNU Lesser General Public
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	10 * License as published by the Free Software Foundation; either
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	11 * version 2.1 of the License, or (at your option) any later version.
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	12 *
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	13 * FFmpeg is distributed in the hope that it will be useful,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	16 * Lesser General Public License for more details.
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	17 *
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	18 * You should have received a copy of the GNU Lesser General Public
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	19 * License along with FFmpeg; if not, write to the Free Software
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	21 */
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	22
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	23 #include "libavutil/x86_cpu.h"
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	24 #include "libavcodec/vp8dsp.h"
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	25
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	26 #if HAVE_YASM
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	27
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	28 /*
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	29 * MC functions
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	30 */
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	31 extern void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, int dststride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	32 uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	33 int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	34 extern void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, int dststride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	35 uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	36 int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	37 extern void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, int dststride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	38 uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	39 int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	40 extern void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, int dststride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	41 uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	42 int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	43
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	44 extern void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, int dststride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	45 uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	46 int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	47 extern void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, int dststride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	48 uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	49 int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	50 extern void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, int dststride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	51 uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	52 int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	53 extern void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, int dststride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	54 uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	55 int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	56
12054 b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	57 extern void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, int dststride,
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	58 uint8_t *src, int srcstride,
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	59 int height, int mx, int my);
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	60 extern void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, int dststride,
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	61 uint8_t *src, int srcstride,
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	62 int height, int mx, int my);
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	63 extern void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, int dststride,
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	64 uint8_t *src, int srcstride,
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	65 int height, int mx, int my);
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	66 extern void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, int dststride,
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	67 uint8_t *src, int srcstride,
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	68 int height, int mx, int my);
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	69 extern void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, int dststride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	70 uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	71 int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	72 extern void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, int dststride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	73 uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	74 int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	75 extern void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, int dststride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	76 uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	77 int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	78 extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, int dststride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	79 uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	80 int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	81
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	82 extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, int dststride,
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	83 uint8_t *src, int srcstride,
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	84 int height, int mx, int my);
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	85 extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, int dststride,
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	86 uint8_t *src, int srcstride,
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	87 int height, int mx, int my);
12082 8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions darkshikari parents: 12054 diff changeset	88 extern void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, int dststride,
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions darkshikari parents: 12054 diff changeset	89 uint8_t *src, int srcstride,
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions darkshikari parents: 12054 diff changeset	90 int height, int mx, int my);
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions darkshikari parents: 12054 diff changeset	91 extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, int dststride,
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions darkshikari parents: 12054 diff changeset	92 uint8_t *src, int srcstride,
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions darkshikari parents: 12054 diff changeset	93 int height, int mx, int my);
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	94
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	95 extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, int dststride,
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	96 uint8_t *src, int srcstride,
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	97 int height, int mx, int my);
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	98 extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, int dststride,
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	99 uint8_t *src, int srcstride,
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	100 int height, int mx, int my);
12082 8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions darkshikari parents: 12054 diff changeset	101 extern void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, int dststride,
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions darkshikari parents: 12054 diff changeset	102 uint8_t *src, int srcstride,
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions darkshikari parents: 12054 diff changeset	103 int height, int mx, int my);
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	104 extern void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, int dststride,
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	105 uint8_t *src, int srcstride,
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	106 int height, int mx, int my);
12082 8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions darkshikari parents: 12054 diff changeset	107
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	108
11992 da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	109 extern void ff_put_vp8_pixels8_mmx (uint8_t *dst, int dststride,
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	110 uint8_t *src, int srcstride,
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	111 int height, int mx, int my);
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	112 extern void ff_put_vp8_pixels16_mmx(uint8_t *dst, int dststride,
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	113 uint8_t *src, int srcstride,
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	114 int height, int mx, int my);
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	115 extern void ff_put_vp8_pixels16_sse(uint8_t *dst, int dststride,
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	116 uint8_t *src, int srcstride,
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	117 int height, int mx, int my);
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	118
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	119 #define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	120 static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	121 uint8_t *dst, int dststride, uint8_t *src, \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	122 int srcstride, int height, int mx, int my) \
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	123 { \
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	124 ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	125 dst, dststride, src, srcstride, height, mx, my); \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	126 ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	127 dst + 8, dststride, src + 8, srcstride, height, mx, my); \
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	128 }
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	129 #define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	130 static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	131 uint8_t *dst, int dststride, uint8_t *src, \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	132 int srcstride, int height, int mx, int my) \
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	133 { \
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	134 ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	135 dst, dststride, src, srcstride, height, mx, my); \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	136 ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	137 dst + 4, dststride, src + 4, srcstride, height, mx, my); \
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	138 }
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	139
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	140 TAP_W8 (mmxext, epel, h4)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	141 TAP_W8 (mmxext, epel, h6)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	142 TAP_W16(mmxext, epel, h6)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	143 TAP_W8 (mmxext, epel, v4)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	144 TAP_W8 (mmxext, epel, v6)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	145 TAP_W16(mmxext, epel, v6)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	146 TAP_W8 (mmxext, bilinear, h)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	147 TAP_W16(mmxext, bilinear, h)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	148 TAP_W8 (mmxext, bilinear, v)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	149 TAP_W16(mmxext, bilinear, v)
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	150
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	151 TAP_W16(sse2, epel, h6)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	152 TAP_W16(sse2, epel, v6)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	153 TAP_W16(sse2, bilinear, h)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	154 TAP_W16(sse2, bilinear, v)
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	155
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	156 TAP_W16(ssse3, epel, h6)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	157 TAP_W16(ssse3, epel, v6)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	158 TAP_W16(ssse3, bilinear, h)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	159 TAP_W16(ssse3, bilinear, v)
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	160
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	161 #define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	162 static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	163 uint8_t *dst, int dststride, uint8_t *src, \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	164 int srcstride, int height, int mx, int my) \
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	165 { \
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	166 DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	167 uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	168 src -= srcstride * (TAPNUMY / 2 - 1); \
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	169 ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	170 tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	171 ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	172 dst, dststride, tmpptr, SIZE, height, mx, my); \
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	173 }
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	174
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	175 #define HVTAPMMX(x, y) \
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	176 HVTAP(mmxext, 8, x, y, 4, 8) \
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	177 HVTAP(mmxext, 8, x, y, 8, 16)
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	178
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	179 HVTAPMMX(4, 4)
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	180 HVTAPMMX(4, 6)
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	181 HVTAPMMX(6, 4)
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	182 HVTAPMMX(6, 6)
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	183 HVTAP(mmxext, 8, 6, 6, 16, 16)
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	184
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	185 #define HVTAPSSE2(x, y, w) \
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	186 HVTAP(sse2, 16, x, y, w, 16) \
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	187 HVTAP(ssse3, 16, x, y, w, 16)
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	188
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	189 HVTAPSSE2(4, 4, 8)
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	190 HVTAPSSE2(4, 6, 8)
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	191 HVTAPSSE2(6, 4, 8)
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	192 HVTAPSSE2(6, 6, 8)
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	193 HVTAPSSE2(6, 6, 16)
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	194
12054 b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	195 HVTAP(ssse3, 16, 4, 4, 4, 8)
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	196 HVTAP(ssse3, 16, 4, 6, 4, 8)
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	197 HVTAP(ssse3, 16, 6, 4, 4, 8)
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	198 HVTAP(ssse3, 16, 6, 6, 4, 8)
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	199
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	200 #define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	201 static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	202 uint8_t *dst, int dststride, uint8_t *src, \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	203 int srcstride, int height, int mx, int my) \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	204 { \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	205 DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	206 ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	207 tmp, SIZE, src, srcstride, height + 1, mx, my); \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	208 ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	209 dst, dststride, tmp, SIZE, height, mx, my); \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	210 }
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	211
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	212 HVBILIN(mmxext, 8, 4, 8)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	213 HVBILIN(mmxext, 8, 8, 16)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	214 HVBILIN(mmxext, 8, 16, 16)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	215 HVBILIN(sse2, 8, 8, 16)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	216 HVBILIN(sse2, 8, 16, 16)
12082 8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions darkshikari parents: 12054 diff changeset	217 HVBILIN(ssse3, 8, 4, 8)
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	218 HVBILIN(ssse3, 8, 8, 16)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	219 HVBILIN(ssse3, 8, 16, 16)
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	220
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	221 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	222 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
12241 c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma. darkshikari parents: 12238 diff changeset	223 extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16], int stride);
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma. darkshikari parents: 12238 diff changeset	224 extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma. darkshikari parents: 12238 diff changeset	225 extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16], int stride);
12209 9eef00a43280 Make mmx VP8 WHT faster darkshikari parents: 12205 diff changeset	226 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
12340 2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT darkshikari parents: 12334 diff changeset	227 extern void ff_vp8_luma_dc_wht_sse(DCTELEM block[4][4][16], DCTELEM dc[16]);
12013 2ae70e2c31a4 MMX idct_add for VP8. rbultje parents: 12006 diff changeset	228 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
12235 e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks. darkshikari parents: 12227 diff changeset	229 extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
12086 d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros). rbultje parents: 12082 diff changeset	230
12210 baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	231 #define DECLARE_LOOP_FILTER(NAME)\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	232 extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	233 extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	234 extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	235 int e, int i, int hvt);\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	236 extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	237 int e, int i, int hvt);\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	238 extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	239 int s, int e, int i, int hvt);\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	240 extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	241 int s, int e, int i, int hvt);\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	242 extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	243 int e, int i, int hvt);\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	244 extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	245 int e, int i, int hvt);\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	246 extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	247 int s, int e, int i, int hvt);\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	248 extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	249 int s, int e, int i, int hvt);
12204 563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder. rbultje parents: 12198 diff changeset	250
12210 baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	251 DECLARE_LOOP_FILTER(mmx)
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	252 DECLARE_LOOP_FILTER(mmxext)
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	253 DECLARE_LOOP_FILTER(sse2)
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	254 DECLARE_LOOP_FILTER(ssse3)
12227 d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on rbultje parents: 12214 diff changeset	255 DECLARE_LOOP_FILTER(sse4)
12205 d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16) rbultje parents: 12204 diff changeset	256
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	257 #endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	258
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	259 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	260 c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	261 c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	262 c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	263
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	264 #define VP8_MC_FUNC(IDX, SIZE, OPT) \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	265 c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	266 c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	267 c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	268 c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	269 c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	270 VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	271
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	272 #define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	273 c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	274 c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	275 c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	276 c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	277 c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	278 c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	279 c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	280 c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	281
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	282
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	283 av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	284 {
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	285 mm_flags = mm_support();
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	286
11976 19374f2992bf Fix build without yasm conrad parents: 11975 diff changeset	287 #if HAVE_YASM
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	288 if (mm_flags & FF_MM_MMX) {
12241 c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma. darkshikari parents: 12238 diff changeset	289 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma. darkshikari parents: 12238 diff changeset	290 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma. darkshikari parents: 12238 diff changeset	291 c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma. darkshikari parents: 12238 diff changeset	292 c->vp8_idct_add = ff_vp8_idct_add_mmx;
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma. darkshikari parents: 12238 diff changeset	293 c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
11992 da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	294 c->put_vp8_epel_pixels_tab[0][0][0] =
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	295 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	296 c->put_vp8_epel_pixels_tab[1][0][0] =
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	297 c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
12086 d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros). rbultje parents: 12082 diff changeset	298
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros). rbultje parents: 12082 diff changeset	299 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros). rbultje parents: 12082 diff changeset	300 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
12168 b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations. rbultje parents: 12086 diff changeset	301
12194 80b142c2e9f7 Change function prototypes for width=8 inner and mbedge loopfilter functions rbultje parents: 12168 diff changeset	302 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
80b142c2e9f7 Change function prototypes for width=8 inner and mbedge loopfilter functions rbultje parents: 12168 diff changeset	303 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
12204 563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder. rbultje parents: 12198 diff changeset	304 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder. rbultje parents: 12198 diff changeset	305 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
12205 d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16) rbultje parents: 12204 diff changeset	306
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16) rbultje parents: 12204 diff changeset	307 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16) rbultje parents: 12204 diff changeset	308 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16) rbultje parents: 12204 diff changeset	309 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16) rbultje parents: 12204 diff changeset	310 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	311 }
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	312
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	313 /* note that 4-tap width=16 functions are missing because w=16
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	314 * is only used for luma, and luma is always a copy or sixtap. */
11993 c15e87b9767b Change MMXEXT to MMX2, MMXEXT is deprecated bcoudurier parents: 11992 diff changeset	315 if (mm_flags & FF_MM_MMX2) {
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	316 VP8_LUMA_MC_FUNC(0, 16, mmxext);
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	317 VP8_MC_FUNC(1, 8, mmxext);
12042 dc4feabd4dab Fix 100L in vp8dsp asm init darkshikari parents: 12013 diff changeset	318 VP8_MC_FUNC(2, 4, mmxext);
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	319 VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	320 VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
12042 dc4feabd4dab Fix 100L in vp8dsp asm init darkshikari parents: 12013 diff changeset	321 VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
12086 d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros). rbultje parents: 12082 diff changeset	322
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros). rbultje parents: 12082 diff changeset	323 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros). rbultje parents: 12082 diff changeset	324 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
12168 b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations. rbultje parents: 12086 diff changeset	325
12194 80b142c2e9f7 Change function prototypes for width=8 inner and mbedge loopfilter functions rbultje parents: 12168 diff changeset	326 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
80b142c2e9f7 Change function prototypes for width=8 inner and mbedge loopfilter functions rbultje parents: 12168 diff changeset	327 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
12204 563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder. rbultje parents: 12198 diff changeset	328 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder. rbultje parents: 12198 diff changeset	329 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
12205 d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16) rbultje parents: 12204 diff changeset	330
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16) rbultje parents: 12204 diff changeset	331 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16) rbultje parents: 12204 diff changeset	332 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16) rbultje parents: 12204 diff changeset	333 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16) rbultje parents: 12204 diff changeset	334 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	335 }
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	336
11992 da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	337 if (mm_flags & FF_MM_SSE) {
12235 e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks. darkshikari parents: 12227 diff changeset	338 c->vp8_idct_add = ff_vp8_idct_add_sse;
12340 2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT darkshikari parents: 12334 diff changeset	339 c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
11992 da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	340 c->put_vp8_epel_pixels_tab[0][0][0] =
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	341 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	342 }
da388061b227 Add x86 asm functions for VP8 put_pixels darkshikari parents: 11991 diff changeset	343
12197 fbf4d5b1b664 Remove FF_MM_SSE2/3 flags for CPUs where this is generally not faster than rbultje parents: 12196 diff changeset	344 if (mm_flags & (FF_MM_SSE2|FF_MM_SSE2SLOW)) {
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	345 VP8_LUMA_MC_FUNC(0, 16, sse2);
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	346 VP8_MC_FUNC(1, 8, sse2);
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	347 VP8_BILINEAR_MC_FUNC(0, 16, sse2);
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	348 VP8_BILINEAR_MC_FUNC(1, 8, sse2);
12086 d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros). rbultje parents: 12082 diff changeset	349
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros). rbultje parents: 12082 diff changeset	350 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
12168 b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations. rbultje parents: 12086 diff changeset	351
12194 80b142c2e9f7 Change function prototypes for width=8 inner and mbedge loopfilter functions rbultje parents: 12168 diff changeset	352 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
12204 563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder. rbultje parents: 12198 diff changeset	353 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
12205 d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16) rbultje parents: 12204 diff changeset	354
12214 657d353cd515 Fix and enable horizontal >=SSE2 mbedge loopfilter. rbultje parents: 12210 diff changeset	355 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
657d353cd515 Fix and enable horizontal >=SSE2 mbedge loopfilter. rbultje parents: 12210 diff changeset	356 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
12197 fbf4d5b1b664 Remove FF_MM_SSE2/3 flags for CPUs where this is generally not faster than rbultje parents: 12196 diff changeset	357 }
fbf4d5b1b664 Remove FF_MM_SSE2/3 flags for CPUs where this is generally not faster than rbultje parents: 12196 diff changeset	358
fbf4d5b1b664 Remove FF_MM_SSE2/3 flags for CPUs where this is generally not faster than rbultje parents: 12196 diff changeset	359 if (mm_flags & FF_MM_SSE2) {
12241 c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma. darkshikari parents: 12238 diff changeset	360 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
12238 1a7903913e9b VP8: 30% faster idct_mb darkshikari parents: 12235 diff changeset	361
12334 435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise rbultje parents: 12241 diff changeset	362 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise rbultje parents: 12241 diff changeset	363
12194 80b142c2e9f7 Change function prototypes for width=8 inner and mbedge loopfilter functions rbultje parents: 12168 diff changeset	364 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
12204 563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder. rbultje parents: 12198 diff changeset	365 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
12205 d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16) rbultje parents: 12204 diff changeset	366
12214 657d353cd515 Fix and enable horizontal >=SSE2 mbedge loopfilter. rbultje parents: 12210 diff changeset	367 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
657d353cd515 Fix and enable horizontal >=SSE2 mbedge loopfilter. rbultje parents: 12210 diff changeset	368 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	369 }
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	370
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	371 if (mm_flags & FF_MM_SSSE3) {
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	372 VP8_LUMA_MC_FUNC(0, 16, ssse3);
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	373 VP8_MC_FUNC(1, 8, ssse3);
12054 b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions darkshikari parents: 12042 diff changeset	374 VP8_MC_FUNC(2, 4, ssse3);
11991 a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	375 VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC darkshikari parents: 11976 diff changeset	376 VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
12082 8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions darkshikari parents: 12054 diff changeset	377 VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
12210 baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	378
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	379 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	380 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	381
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	382 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	383 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	384 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	385 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	386
baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	387 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
12214 657d353cd515 Fix and enable horizontal >=SSE2 mbedge loopfilter. rbultje parents: 12210 diff changeset	388 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
12210 baf13deed97e Various VP8 x86 deblocking speedups darkshikari parents: 12209 diff changeset	389 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
12214 657d353cd515 Fix and enable horizontal >=SSE2 mbedge loopfilter. rbultje parents: 12210 diff changeset	390 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	391 }
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	392
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	393 if (mm_flags & FF_MM_SSE4) {
c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	394 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
12227 d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on rbultje parents: 12214 diff changeset	395
12334 435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise rbultje parents: 12241 diff changeset	396 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
12227 d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on rbultje parents: 12214 diff changeset	397 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on rbultje parents: 12214 diff changeset	398 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	399 }
11976 19374f2992bf Fix build without yasm conrad parents: 11975 diff changeset	400 #endif
11975 c3afb5be0d9b First shot at VP8 optimizations: rbultje parents: diff changeset	401 }

Mercurial > libavcodec.hg

annotate x86/vp8dsp-init.c @ 12340:2d15f62f4f8a libavcodec