comparison i386/dsputil_h264_template_ssse3.c @ 6557:e1208c4f8898 libavcodec

h264 chroma mc ssse3 width8: 180->92, width4: 78->63 cycles (core2)
author lorenm
date Tue, 01 Apr 2008 04:51:28 +0000
parents
children d869966e57e5
/*
 * Copyright (c) 2008 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * SSSE3 optimized version of (put|avg)_h264_chroma_mc8.
 * H264_CHROMA_MC8_TMPL must be defined to the desired function name
 * H264_CHROMA_MC4_TMPL must likewise be defined to the name of the 4-pixel-wide variant
 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
 * AVG_OP must be defined to empty for put and the identity for avg
 */
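/* This template is meant to be #included after defining the macros above.
 * A hypothetical put/SSSE3 instantiation (the macro targets shown here are
 * illustrative names, not necessarily the ones used elsewhere in FFmpeg)
 * might look like:
 *
 *     #define AVG_OP(X)
 *     #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
 *     #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
 *     #define H264_CHROMA_MC8_MV0  put_pixels8_mmx
 *     #include "dsputil_h264_template_ssse3.c"
 *
 * The avg variant would instead define AVG_OP(X) as X and point
 * H264_CHROMA_MC8_MV0 at an avg_pixels8 function.
 */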
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
{
    if(y==0 && x==0) {
        /* no filter needed */
        H264_CHROMA_MC8_MV0(dst, src, stride, h);
        return;
    }

    assert(x<8 && y<8 && x>=0 && y>=0);

    if(y==0 || x==0)
    {
        /* 1 dimensional filter only */
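        /* With k = x+y (only one of x,y is nonzero here), 255*k+8 packs the
         * two filter taps into one word: low byte 8-k, high byte k.  After
         * broadcasting it, pmaddubsw on bytes interleaved as (a,b) computes
         * (8-k)*a + k*b per word, so the loops below produce
         *     dst[i] = ((8-k)*a + k*b + 4) >> 3
         * (the bias is 3 instead of 4 in the no-rounding case). */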
        asm volatile(
            "movd %0, %%xmm7 \n\t"
            "movq %1, %%xmm6 \n\t"
            "pshuflw $0, %%xmm7, %%xmm7 \n\t"
            "movlhps %%xmm6, %%xmm6 \n\t"
            "movlhps %%xmm7, %%xmm7 \n\t"
            :: "r"(255*(x+y)+8), "m"(rnd?ff_pw_4:ff_pw_3));

        if(x) {
            /* horizontal filter: taps at src[i] and src[i+1], two rows per iteration */
            asm volatile(
                "1: \n\t"
                "movq (%1), %%xmm0 \n\t"
                "movq 1(%1), %%xmm1 \n\t"
                "movq (%1,%3), %%xmm2 \n\t"
                "movq 1(%1,%3), %%xmm3 \n\t"
                "punpcklbw %%xmm1, %%xmm0 \n\t"
                "punpcklbw %%xmm3, %%xmm2 \n\t"
                "pmaddubsw %%xmm7, %%xmm0 \n\t"
                "pmaddubsw %%xmm7, %%xmm2 \n\t"
                AVG_OP("movq (%0), %%xmm4 \n\t")
                AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
                "paddw %%xmm6, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm2 \n\t"
                "psrlw $3, %%xmm0 \n\t"
                "psrlw $3, %%xmm2 \n\t"
                "packuswb %%xmm2, %%xmm0 \n\t"
                AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
                "movq %%xmm0, (%0) \n\t"
                "movhps %%xmm0, (%0,%3) \n\t"
                "sub $2, %2 \n\t"
                "lea (%1,%3,2), %1 \n\t"
                "lea (%0,%3,2), %0 \n\t"
                "jg 1b \n\t"
                :"+r"(dst), "+r"(src), "+r"(h)
                :"r"((long)stride)
            );
        } else {
            /* vertical filter: taps at src[i] and src[i+stride], two rows per iteration */
            asm volatile(
                "1: \n\t"
                "movq (%1), %%xmm0 \n\t"
                "movq (%1,%3), %%xmm1 \n\t"
                "movdqa %%xmm1, %%xmm2 \n\t"
                "movq (%1,%3,2), %%xmm3 \n\t"
                "punpcklbw %%xmm1, %%xmm0 \n\t"
                "punpcklbw %%xmm3, %%xmm2 \n\t"
                "pmaddubsw %%xmm7, %%xmm0 \n\t"
                "pmaddubsw %%xmm7, %%xmm2 \n\t"
                AVG_OP("movq (%0), %%xmm4 \n\t")
                AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
                "paddw %%xmm6, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm2 \n\t"
                "psrlw $3, %%xmm0 \n\t"
                "psrlw $3, %%xmm2 \n\t"
                "packuswb %%xmm2, %%xmm0 \n\t"
                AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
                "movq %%xmm0, (%0) \n\t"
                "movhps %%xmm0, (%0,%3) \n\t"
                "sub $2, %2 \n\t"
                "lea (%1,%3,2), %1 \n\t"
                "lea (%0,%3,2), %0 \n\t"
                "jg 1b \n\t"
                :"+r"(dst), "+r"(src), "+r"(h)
                :"r"((long)stride)
            );
        }
        return;
    }

    /* general case, bilinear */
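    /* The bilinear filter dst[i] = (A*s[i] + B*s[i+1] + C*s[i+stride] +
     * D*s[i+stride+1] + 32) >> 6 with A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy
     * is evaluated with two pmaddubsw per row pair: (x*255+8)*(8-y) packs
     * (A,B) into each word of xmm7 and (x*255+8)*y packs (C,D) into xmm6
     * (the bias is 28 instead of 32 in the no-rounding case). */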
    asm volatile(
        "movd %0, %%xmm7 \n\t"
        "movd %1, %%xmm6 \n\t"
        "movdqa %2, %%xmm5 \n\t"
        "pshuflw $0, %%xmm7, %%xmm7 \n\t"
        "pshuflw $0, %%xmm6, %%xmm6 \n\t"
        "movlhps %%xmm7, %%xmm7 \n\t"
        "movlhps %%xmm6, %%xmm6 \n\t"
        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(rnd?ff_pw_32:ff_pw_28)
    );

    /* xmm0 carries the previous row's interleaved pixels from one iteration
     * to the next, so each source row is loaded and unpacked only once */
    asm volatile(
        "movq (%1), %%xmm0 \n\t"
        "movq 1(%1), %%xmm1 \n\t"
        "punpcklbw %%xmm1, %%xmm0 \n\t"
        "add %3, %1 \n\t"
        "1: \n\t"
        "movq (%1), %%xmm1 \n\t"
        "movq 1(%1), %%xmm2 \n\t"
        "movq (%1,%3), %%xmm3 \n\t"
        "movq 1(%1,%3), %%xmm4 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "punpcklbw %%xmm2, %%xmm1 \n\t"
        "punpcklbw %%xmm4, %%xmm3 \n\t"
        "movdqa %%xmm1, %%xmm2 \n\t"
        "movdqa %%xmm3, %%xmm4 \n\t"
        "pmaddubsw %%xmm7, %%xmm0 \n\t"
        "pmaddubsw %%xmm6, %%xmm1 \n\t"
        "pmaddubsw %%xmm7, %%xmm2 \n\t"
        "pmaddubsw %%xmm6, %%xmm3 \n\t"
        "paddw %%xmm5, %%xmm0 \n\t"
        "paddw %%xmm5, %%xmm2 \n\t"
        "paddw %%xmm0, %%xmm1 \n\t"
        "paddw %%xmm2, %%xmm3 \n\t"
        "movdqa %%xmm4, %%xmm0 \n\t"
        "psrlw $6, %%xmm1 \n\t"
        "psrlw $6, %%xmm3 \n\t"
        AVG_OP("movq (%0), %%xmm2 \n\t")
        AVG_OP("movhps (%0,%3), %%xmm2 \n\t")
        "packuswb %%xmm3, %%xmm1 \n\t"
        AVG_OP("pavgb %%xmm2, %%xmm1 \n\t")
        "movq %%xmm1, (%0)\n\t"
        "movhps %%xmm1, (%0,%3)\n\t"
        "sub $2, %2 \n\t"
        "lea (%0,%3,2), %0 \n\t"
        "jg 1b \n\t"
        :"+r"(dst), "+r"(src), "+r"(h)
        :"r"((long)stride)
    );
}

static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    asm volatile(
        "movd %0, %%mm7 \n\t"
        "movd %1, %%mm6 \n\t"
        "movq %2, %%mm5 \n\t"
        "pshufw $0, %%mm7, %%mm7 \n\t"
        "pshufw $0, %%mm6, %%mm6 \n\t"
        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32)
    );

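    /* Same (A,B)/(C,D) coefficient packing as the 8-wide bilinear case above,
     * kept in MMX registers and broadcast with pshufw; this template only
     * implements the general bilinear path for width 4. */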
    asm volatile(
        "movd (%1), %%mm0 \n\t"
        "punpcklbw 1(%1), %%mm0 \n\t"
        "add %3, %1 \n\t"
        "1: \n\t"
        "movd (%1), %%mm1 \n\t"
        "movd (%1,%3), %%mm3 \n\t"
        "punpcklbw 1(%1), %%mm1 \n\t"
        "punpcklbw 1(%1,%3), %%mm3 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "movq %%mm1, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pmaddubsw %%mm7, %%mm0 \n\t"
        "pmaddubsw %%mm6, %%mm1 \n\t"
        "pmaddubsw %%mm7, %%mm2 \n\t"
        "pmaddubsw %%mm6, %%mm3 \n\t"
        "paddw %%mm5, %%mm0 \n\t"
        "paddw %%mm5, %%mm2 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm0 \n\t"
        "psrlw $6, %%mm1 \n\t"
        "psrlw $6, %%mm3 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "packuswb %%mm3, %%mm3 \n\t"
        AVG_OP("pavgb (%0), %%mm1 \n\t")
        AVG_OP("pavgb (%0,%3), %%mm3 \n\t")
        "movd %%mm1, (%0)\n\t"
        "movd %%mm3, (%0,%3)\n\t"
        "sub $2, %2 \n\t"
        "lea (%0,%3,2), %0 \n\t"
        "jg 1b \n\t"
        :"+r"(dst), "+r"(src), "+r"(h)
        :"r"((long)stride)
    );
}
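
/* For reference, a plain-C sketch of the interpolation these routines
 * implement (a hypothetical helper for illustration only, not part of this
 * file's interface; the put case with rounding is shown):
 *
 *     static void chroma_mc_c(uint8_t *dst, uint8_t *src, int stride,
 *                             int w, int h, int x, int y)
 *     {
 *         const int A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y;
 *         int i, j;
 *         for(j=0; j<h; j++){
 *             for(i=0; i<w; i++)
 *                 dst[i] = (A*src[i] + B*src[i+1] +
 *                           C*src[i+stride] + D*src[i+stride+1] + 32) >> 6;
 *             dst += stride;
 *             src += stride;
 *         }
 *     }
 */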