annotate i386/dsputil_h264_template_ssse3.c @ 7900:37f62a3dc645 libavcodec

Correct comment in the direct mode code. (note, yes this is unrelated to the previous simplification, the code always behaved like it is documented now.)
author michael
date Sun, 21 Sep 2008 23:49:53 +0000
parents 33896780c612
children eebc7209c47f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6557
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
1 /*
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
2 * Copyright (c) 2008 Loren Merritt
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
3 *
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
4 * This file is part of FFmpeg.
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
5 *
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
10 *
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
14 * Lesser General Public License for more details.
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
15 *
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
19 */
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
20
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
21 /**
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
22 * SSSE3 optimized version of (put|avg)_h264_chroma_mc8.
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
23 * H264_CHROMA_MC8_TMPL must be defined to the desired function name
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
24 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
25 * AVG_OP must be defined to empty for put and the identify for avg
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
26 */
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
27 static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
28 {
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
29 if(y==0 && x==0) {
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
30 /* no filter needed */
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
31 H264_CHROMA_MC8_MV0(dst, src, stride, h);
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
32 return;
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
33 }
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
34
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
35 assert(x<8 && y<8 && x>=0 && y>=0);
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
36
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
37 if(y==0 || x==0)
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
38 {
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
39 /* 1 dimensional filter only */
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
40 asm volatile(
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
41 "movd %0, %%xmm7 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
42 "movq %1, %%xmm6 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
43 "pshuflw $0, %%xmm7, %%xmm7 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
44 "movlhps %%xmm6, %%xmm6 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
45 "movlhps %%xmm7, %%xmm7 \n\t"
6575
d869966e57e5 Fix H.264 interframe decoding when compiling with icc. Patch by Loren
melanson
parents: 6557
diff changeset
46 :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3))
d869966e57e5 Fix H.264 interframe decoding when compiling with icc. Patch by Loren
melanson
parents: 6557
diff changeset
47 );
6557
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
48
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
49 if(x) {
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
50 asm volatile(
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
51 "1: \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
52 "movq (%1), %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
53 "movq 1(%1), %%xmm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
54 "movq (%1,%3), %%xmm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
55 "movq 1(%1,%3), %%xmm3 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
56 "punpcklbw %%xmm1, %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
57 "punpcklbw %%xmm3, %%xmm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
58 "pmaddubsw %%xmm7, %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
59 "pmaddubsw %%xmm7, %%xmm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
60 AVG_OP("movq (%0), %%xmm4 \n\t")
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
61 AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
62 "paddw %%xmm6, %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
63 "paddw %%xmm6, %%xmm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
64 "psrlw $3, %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
65 "psrlw $3, %%xmm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
66 "packuswb %%xmm2, %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
67 AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
68 "movq %%xmm0, (%0) \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
69 "movhps %%xmm0, (%0,%3) \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
70 "sub $2, %2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
71 "lea (%1,%3,2), %1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
72 "lea (%0,%3,2), %0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
73 "jg 1b \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
74 :"+r"(dst), "+r"(src), "+r"(h)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6575
diff changeset
75 :"r"((x86_reg)stride)
6557
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
76 );
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
77 } else {
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
78 asm volatile(
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
79 "1: \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
80 "movq (%1), %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
81 "movq (%1,%3), %%xmm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
82 "movdqa %%xmm1, %%xmm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
83 "movq (%1,%3,2), %%xmm3 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
84 "punpcklbw %%xmm1, %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
85 "punpcklbw %%xmm3, %%xmm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
86 "pmaddubsw %%xmm7, %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
87 "pmaddubsw %%xmm7, %%xmm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
88 AVG_OP("movq (%0), %%xmm4 \n\t")
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
89 AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
90 "paddw %%xmm6, %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
91 "paddw %%xmm6, %%xmm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
92 "psrlw $3, %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
93 "psrlw $3, %%xmm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
94 "packuswb %%xmm2, %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
95 AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
96 "movq %%xmm0, (%0) \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
97 "movhps %%xmm0, (%0,%3) \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
98 "sub $2, %2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
99 "lea (%1,%3,2), %1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
100 "lea (%0,%3,2), %0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
101 "jg 1b \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
102 :"+r"(dst), "+r"(src), "+r"(h)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6575
diff changeset
103 :"r"((x86_reg)stride)
6557
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
104 );
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
105 }
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
106 return;
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
107 }
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
108
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
109 /* general case, bilinear */
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
110 asm volatile(
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
111 "movd %0, %%xmm7 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
112 "movd %1, %%xmm6 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
113 "movdqa %2, %%xmm5 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
114 "pshuflw $0, %%xmm7, %%xmm7 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
115 "pshuflw $0, %%xmm6, %%xmm6 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
116 "movlhps %%xmm7, %%xmm7 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
117 "movlhps %%xmm6, %%xmm6 \n\t"
6575
d869966e57e5 Fix H.264 interframe decoding when compiling with icc. Patch by Loren
melanson
parents: 6557
diff changeset
118 :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28))
6557
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
119 );
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
120
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
121 asm volatile(
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
122 "movq (%1), %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
123 "movq 1(%1), %%xmm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
124 "punpcklbw %%xmm1, %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
125 "add %3, %1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
126 "1: \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
127 "movq (%1), %%xmm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
128 "movq 1(%1), %%xmm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
129 "movq (%1,%3), %%xmm3 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
130 "movq 1(%1,%3), %%xmm4 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
131 "lea (%1,%3,2), %1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
132 "punpcklbw %%xmm2, %%xmm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
133 "punpcklbw %%xmm4, %%xmm3 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
134 "movdqa %%xmm1, %%xmm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
135 "movdqa %%xmm3, %%xmm4 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
136 "pmaddubsw %%xmm7, %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
137 "pmaddubsw %%xmm6, %%xmm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
138 "pmaddubsw %%xmm7, %%xmm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
139 "pmaddubsw %%xmm6, %%xmm3 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
140 "paddw %%xmm5, %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
141 "paddw %%xmm5, %%xmm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
142 "paddw %%xmm0, %%xmm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
143 "paddw %%xmm2, %%xmm3 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
144 "movdqa %%xmm4, %%xmm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
145 "psrlw $6, %%xmm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
146 "psrlw $6, %%xmm3 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
147 AVG_OP("movq (%0), %%xmm2 \n\t")
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
148 AVG_OP("movhps (%0,%3), %%xmm2 \n\t")
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
149 "packuswb %%xmm3, %%xmm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
150 AVG_OP("pavgb %%xmm2, %%xmm1 \n\t")
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
151 "movq %%xmm1, (%0)\n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
152 "movhps %%xmm1, (%0,%3)\n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
153 "sub $2, %2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
154 "lea (%0,%3,2), %0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
155 "jg 1b \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
156 :"+r"(dst), "+r"(src), "+r"(h)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6575
diff changeset
157 :"r"((x86_reg)stride)
6557
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
158 );
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
159 }
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
160
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
161 static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
162 {
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
163 asm volatile(
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
164 "movd %0, %%mm7 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
165 "movd %1, %%mm6 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
166 "movq %2, %%mm5 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
167 "pshufw $0, %%mm7, %%mm7 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
168 "pshufw $0, %%mm6, %%mm6 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
169 :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32)
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
170 );
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
171
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
172 asm volatile(
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
173 "movd (%1), %%mm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
174 "punpcklbw 1(%1), %%mm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
175 "add %3, %1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
176 "1: \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
177 "movd (%1), %%mm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
178 "movd (%1,%3), %%mm3 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
179 "punpcklbw 1(%1), %%mm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
180 "punpcklbw 1(%1,%3), %%mm3 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
181 "lea (%1,%3,2), %1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
182 "movq %%mm1, %%mm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
183 "movq %%mm3, %%mm4 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
184 "pmaddubsw %%mm7, %%mm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
185 "pmaddubsw %%mm6, %%mm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
186 "pmaddubsw %%mm7, %%mm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
187 "pmaddubsw %%mm6, %%mm3 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
188 "paddw %%mm5, %%mm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
189 "paddw %%mm5, %%mm2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
190 "paddw %%mm0, %%mm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
191 "paddw %%mm2, %%mm3 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
192 "movq %%mm4, %%mm0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
193 "psrlw $6, %%mm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
194 "psrlw $6, %%mm3 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
195 "packuswb %%mm1, %%mm1 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
196 "packuswb %%mm3, %%mm3 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
197 AVG_OP("pavgb (%0), %%mm1 \n\t")
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
198 AVG_OP("pavgb (%0,%3), %%mm3 \n\t")
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
199 "movd %%mm1, (%0)\n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
200 "movd %%mm3, (%0,%3)\n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
201 "sub $2, %2 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
202 "lea (%0,%3,2), %0 \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
203 "jg 1b \n\t"
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
204 :"+r"(dst), "+r"(src), "+r"(h)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6575
diff changeset
205 :"r"((x86_reg)stride)
6557
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
206 );
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
207 }
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents:
diff changeset
208