annotate i386/dsputil_h264_template_mmx.c @ 3176:babf844e1308 libavcodec

Init simplification and 2% faster wma_decode_block on amd64 with tables use instead of pow().
author banan
date Wed, 08 Mar 2006 09:26:57 +0000
parents 072dbc669253
children 57d31bdbebe8
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2732
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
1 /*
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
2 * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
3 * Loren Merritt
2732
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
4 *
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
5 * This library is free software; you can redistribute it and/or
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
6 * modify it under the terms of the GNU Lesser General Public
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
7 * License as published by the Free Software Foundation; either
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
8 * version 2 of the License, or (at your option) any later version.
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
9 *
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
10 * This library is distributed in the hope that it will be useful,
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
13 * Lesser General Public License for more details.
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
14 *
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
15 * You should have received a copy of the GNU Lesser General Public
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
16 * License along with this library; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 2922
diff changeset
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
2732
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
18 */
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
19
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
20 /**
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
21 * MMX optimized version of (put|avg)_h264_chroma_mc8.
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
22 * H264_CHROMA_MC8_TMPL must be defined to the desired function name
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
23 * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
24 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
2732
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
25 */
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
26 static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
27 {
3089
072dbc669253 MSVC-compatible __align8/__align16 declaration
diego
parents: 3036
diff changeset
28 DECLARE_ALIGNED_8(uint64_t, AA);
072dbc669253 MSVC-compatible __align8/__align16 declaration
diego
parents: 3036
diff changeset
29 DECLARE_ALIGNED_8(uint64_t, DD);
2732
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
30 int i;
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
31
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
32 if(y==0 && x==0) {
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
33 /* no filter needed */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
34 H264_CHROMA_MC8_MV0(dst, src, stride, h);
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
35 return;
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
36 }
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
37
2732
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
38 assert(x<8 && y<8 && x>=0 && y>=0);
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
39
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
40 if(y==0)
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
41 {
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
42 /* horizontal filter only */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
43 asm volatile("movd %0, %%mm5\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
44 "punpcklwd %%mm5, %%mm5\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
45 "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
46 "movq %1, %%mm4\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
47 "pxor %%mm7, %%mm7\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
48 "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
49 : : "rm" (x), "m" (ff_pw_8));
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
50
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
51 for(i=0; i<h; i++) {
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
52 asm volatile(
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
53 /* mm0 = src[0..7], mm1 = src[1..8] */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
54 "movq %0, %%mm0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
55 "movq %1, %%mm1\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
56 : : "m" (src[0]), "m" (src[1]));
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
57
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
58 asm volatile(
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
59 /* [mm2,mm3] = A * src[0..7] */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
60 "movq %%mm0, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
61 "punpcklbw %%mm7, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
62 "pmullw %%mm4, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
63 "movq %%mm0, %%mm3\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
64 "punpckhbw %%mm7, %%mm3\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
65 "pmullw %%mm4, %%mm3\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
66
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
67 /* [mm2,mm3] += B * src[1..8] */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
68 "movq %%mm1, %%mm0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
69 "punpcklbw %%mm7, %%mm0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
70 "pmullw %%mm5, %%mm0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
71 "punpckhbw %%mm7, %%mm1\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
72 "pmullw %%mm5, %%mm1\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
73 "paddw %%mm0, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
74 "paddw %%mm1, %%mm3\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
75
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
76 /* dst[0..7] = pack(([mm2,mm3] + 4) >> 3) */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
77 "paddw %1, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
78 "paddw %1, %%mm3\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
79 "psrlw $3, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
80 "psrlw $3, %%mm3\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
81 "packuswb %%mm3, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
82 H264_CHROMA_OP(%0, %%mm2)
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
83 "movq %%mm2, %0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
84 : "=m" (dst[0]) : "m" (ff_pw_4));
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
85
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
86 src += stride;
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
87 dst += stride;
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
88 }
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
89 return;
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
90 }
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
91
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
92 if(x==0)
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
93 {
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
94 /* vertical filter only */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
95 asm volatile("movd %0, %%mm6\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
96 "punpcklwd %%mm6, %%mm6\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
97 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = C = y */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
98 "movq %1, %%mm4\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
99 "pxor %%mm7, %%mm7\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
100 "psubw %%mm6, %%mm4\n\t" /* mm4 = A = 8-y */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
101 : : "rm" (y), "m" (ff_pw_8));
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
102
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
103 asm volatile(
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
104 /* mm0 = src[0..7] */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
105 "movq %0, %%mm0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
106 : : "m" (src[0]));
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
107
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
108 for(i=0; i<h; i++) {
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
109 asm volatile(
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
110 /* [mm2,mm3] = A * src[0..7] */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
111 "movq %mm0, %mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
112 "punpcklbw %mm7, %mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
113 "pmullw %mm4, %mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
114 "movq %mm0, %mm3\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
115 "punpckhbw %mm7, %mm3\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
116 "pmullw %mm4, %mm3\n\t");
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
117
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
118 src += stride;
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
119 asm volatile(
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
120 /* mm0 = src[0..7] */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
121 "movq %0, %%mm0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
122 : : "m" (src[0]));
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
123
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
124 asm volatile(
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
125 /* [mm2,mm3] += C * src[0..7] */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
126 "movq %mm0, %mm1\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
127 "punpcklbw %mm7, %mm1\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
128 "pmullw %mm6, %mm1\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
129 "paddw %mm1, %mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
130 "movq %mm0, %mm5\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
131 "punpckhbw %mm7, %mm5\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
132 "pmullw %mm6, %mm5\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
133 "paddw %mm5, %mm3\n\t");
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
134
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
135 asm volatile(
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
136 /* dst[0..7] = pack(([mm2,mm3] + 4) >> 3) */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
137 "paddw %1, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
138 "paddw %1, %%mm3\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
139 "psrlw $3, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
140 "psrlw $3, %%mm3\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
141 "packuswb %%mm3, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
142 H264_CHROMA_OP(%0, %%mm2)
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
143 "movq %%mm2, %0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
144 : "=m" (dst[0]) : "m" (ff_pw_4));
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
145
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
146 dst += stride;
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
147 }
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
148 return;
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
149 }
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
150
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
151 /* general case, bilinear */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
152 asm volatile("movd %2, %%mm4\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
153 "movd %3, %%mm6\n\t"
2732
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
154 "punpcklwd %%mm4, %%mm4\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
155 "punpcklwd %%mm6, %%mm6\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
156 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
157 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
158 "movq %%mm4, %%mm5\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
159 "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
160 "psllw $3, %%mm5\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
161 "psllw $3, %%mm6\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
162 "movq %%mm5, %%mm7\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
163 "paddw %%mm6, %%mm7\n\t"
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
164 "movq %%mm4, %1\n\t" /* DD = x * y */
2732
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
165 "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
166 "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
167 "paddw %4, %%mm4\n\t"
2732
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
168 "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
169 "pxor %%mm7, %%mm7\n\t"
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
170 "movq %%mm4, %0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
171 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
2732
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
172
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
173 asm volatile(
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
174 /* mm0 = src[0..7], mm1 = src[1..8] */
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
175 "movq %0, %%mm0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
176 "movq %1, %%mm1\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
177 : : "m" (src[0]), "m" (src[1]));
2732
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
178
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
179 for(i=0; i<h; i++) {
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
180 asm volatile(
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
181 /* [mm2,mm3] = A * src[0..7] */
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
182 "movq %%mm0, %%mm2\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
183 "punpcklbw %%mm7, %%mm2\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
184 "pmullw %0, %%mm2\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
185 "movq %%mm0, %%mm3\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
186 "punpckhbw %%mm7, %%mm3\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
187 "pmullw %0, %%mm3\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
188
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
189 /* [mm2,mm3] += B * src[1..8] */
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
190 "movq %%mm1, %%mm0\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
191 "punpcklbw %%mm7, %%mm0\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
192 "pmullw %%mm5, %%mm0\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
193 "punpckhbw %%mm7, %%mm1\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
194 "pmullw %%mm5, %%mm1\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
195 "paddw %%mm0, %%mm2\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
196 "paddw %%mm1, %%mm3\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
197 : : "m" (AA));
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
198
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
199 src += stride;
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
200 asm volatile(
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
201 /* mm0 = src[0..7], mm1 = src[1..8] */
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
202 "movq %0, %%mm0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
203 "movq %1, %%mm1\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
204 : : "m" (src[0]), "m" (src[1]));
2732
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
205
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
206 asm volatile(
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
207 /* [mm2,mm3] += C * src[0..7] */
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
208 "movq %mm0, %mm4\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
209 "punpcklbw %mm7, %mm4\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
210 "pmullw %mm6, %mm4\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
211 "paddw %mm4, %mm2\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
212 "movq %mm0, %mm4\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
213 "punpckhbw %mm7, %mm4\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
214 "pmullw %mm6, %mm4\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
215 "paddw %mm4, %mm3\n\t");
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
216
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
217 asm volatile(
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
218 /* [mm2,mm3] += D * src[1..8] */
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
219 "movq %%mm1, %%mm4\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
220 "punpcklbw %%mm7, %%mm4\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
221 "pmullw %0, %%mm4\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
222 "paddw %%mm4, %%mm2\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
223 "movq %%mm1, %%mm4\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
224 "punpckhbw %%mm7, %%mm4\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
225 "pmullw %0, %%mm4\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
226 "paddw %%mm4, %%mm3\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
227 : : "m" (DD));
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
228
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
229 asm volatile(
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
230 /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
231 "paddw %1, %%mm2\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
232 "paddw %1, %%mm3\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
233 "psrlw $6, %%mm2\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
234 "psrlw $6, %%mm3\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
235 "packuswb %%mm3, %%mm2\n\t"
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
236 H264_CHROMA_OP(%0, %%mm2)
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
237 "movq %%mm2, %0\n\t"
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents: 2732
diff changeset
238 : "=m" (dst[0]) : "m" (ff_pw_32));
2732
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
239 dst+= stride;
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
240 }
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents:
diff changeset
241 }
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
242
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
243 static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
244 {
3089
072dbc669253 MSVC-compatible __align8/__align16 declaration
diego
parents: 3036
diff changeset
245 DECLARE_ALIGNED_8(uint64_t, AA);
072dbc669253 MSVC-compatible __align8/__align16 declaration
diego
parents: 3036
diff changeset
246 DECLARE_ALIGNED_8(uint64_t, DD);
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
247 int i;
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
248
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
249 /* no special case for mv=(0,0) in 4x*, since it's much less common than in 8x*.
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
250 * could still save a few cycles, but maybe not worth the complexity. */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
251
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
252 assert(x<8 && y<8 && x>=0 && y>=0);
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
253
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
254 asm volatile("movd %2, %%mm4\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
255 "movd %3, %%mm6\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
256 "punpcklwd %%mm4, %%mm4\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
257 "punpcklwd %%mm6, %%mm6\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
258 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
259 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
260 "movq %%mm4, %%mm5\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
261 "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
262 "psllw $3, %%mm5\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
263 "psllw $3, %%mm6\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
264 "movq %%mm5, %%mm7\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
265 "paddw %%mm6, %%mm7\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
266 "movq %%mm4, %1\n\t" /* DD = x * y */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
267 "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
268 "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
269 "paddw %4, %%mm4\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
270 "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
271 "pxor %%mm7, %%mm7\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
272 "movq %%mm4, %0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
273 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
274
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
275 asm volatile(
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
276 /* mm0 = src[0..3], mm1 = src[1..4] */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
277 "movd %0, %%mm0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
278 "movd %1, %%mm1\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
279 "punpcklbw %%mm7, %%mm0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
280 "punpcklbw %%mm7, %%mm1\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
281 : : "m" (src[0]), "m" (src[1]));
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
282
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
283 for(i=0; i<h; i++) {
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
284 asm volatile(
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
285 /* mm2 = A * src[0..3] + B * src[1..4] */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
286 "movq %%mm0, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
287 "pmullw %0, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
288 "pmullw %%mm5, %%mm1\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
289 "paddw %%mm1, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
290 : : "m" (AA));
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
291
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
292 src += stride;
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
293 asm volatile(
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
294 /* mm0 = src[0..3], mm1 = src[1..4] */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
295 "movd %0, %%mm0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
296 "movd %1, %%mm1\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
297 "punpcklbw %%mm7, %%mm0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
298 "punpcklbw %%mm7, %%mm1\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
299 : : "m" (src[0]), "m" (src[1]));
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
300
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
301 asm volatile(
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
302 /* mm2 += C * src[0..3] + D * src[1..4] */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
303 "movq %%mm0, %%mm3\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
304 "movq %%mm1, %%mm4\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
305 "pmullw %%mm6, %%mm3\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
306 "pmullw %0, %%mm4\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
307 "paddw %%mm3, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
308 "paddw %%mm4, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
309 : : "m" (DD));
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
310
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
311 asm volatile(
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
312 /* dst[0..3] = pack((mm2 + 32) >> 6) */
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
313 "paddw %1, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
314 "psrlw $6, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
315 "packuswb %%mm7, %%mm2\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
316 H264_CHROMA_OP4(%0, %%mm2, %%mm3)
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
317 "movd %%mm2, %0\n\t"
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
318 : "=m" (dst[0]) : "m" (ff_pw_32));
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
319 dst += stride;
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
320 }
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2754
diff changeset
321 }