Mercurial > libavcodec.hg
comparison i386/dsputil_h264_template_ssse3.c @ 6557:e1208c4f8898 libavcodec
h264 chroma mc ssse3
width8: 180->92, width4: 78->63 cycles (core2)
author | lorenm |
---|---|
date | Tue, 01 Apr 2008 04:51:28 +0000 |
parents | |
children | d869966e57e5 |
comparison
equal
deleted
inserted
replaced
6556:8300baeb2b5f | 6557:e1208c4f8898 |
---|---|
1 /* | |
2 * Copyright (c) 2008 Loren Merritt | |
3 * | |
4 * This file is part of FFmpeg. | |
5 * | |
6 * FFmpeg is free software; you can redistribute it and/or | |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2.1 of the License, or (at your option) any later version. | |
10 * | |
11 * FFmpeg is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with FFmpeg; if not, write to the Free Software | |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 */ | |
20 | |
21 /** | |
22 * SSSE3 optimized version of (put|avg)_h264_chroma_mc8. | |
23 * H264_CHROMA_MC8_TMPL must be defined to the desired function name | |
24 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function | |
25 * AVG_OP must be defined to empty for put and the identity for avg | |
26 */ | |
27 static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd) | |
28 { | |
29 if(y==0 && x==0) { | |
30 /* no filter needed */ | |
31 H264_CHROMA_MC8_MV0(dst, src, stride, h); | |
32 return; | |
33 } | |
34 | |
35 assert(x<8 && y<8 && x>=0 && y>=0); | |
36 | |
37 if(y==0 || x==0) | |
38 { | |
39 /* 1 dimensional filter only */ | |
40 asm volatile( | |
41 "movd %0, %%xmm7 \n\t" | |
42 "movq %1, %%xmm6 \n\t" | |
43 "pshuflw $0, %%xmm7, %%xmm7 \n\t" | |
44 "movlhps %%xmm6, %%xmm6 \n\t" | |
45 "movlhps %%xmm7, %%xmm7 \n\t" | |
46 :: "r"(255*(x+y)+8), "m"(rnd?ff_pw_4:ff_pw_3)); | |
47 | |
48 if(x) { | |
49 asm volatile( | |
50 "1: \n\t" | |
51 "movq (%1), %%xmm0 \n\t" | |
52 "movq 1(%1), %%xmm1 \n\t" | |
53 "movq (%1,%3), %%xmm2 \n\t" | |
54 "movq 1(%1,%3), %%xmm3 \n\t" | |
55 "punpcklbw %%xmm1, %%xmm0 \n\t" | |
56 "punpcklbw %%xmm3, %%xmm2 \n\t" | |
57 "pmaddubsw %%xmm7, %%xmm0 \n\t" | |
58 "pmaddubsw %%xmm7, %%xmm2 \n\t" | |
59 AVG_OP("movq (%0), %%xmm4 \n\t") | |
60 AVG_OP("movhps (%0,%3), %%xmm4 \n\t") | |
61 "paddw %%xmm6, %%xmm0 \n\t" | |
62 "paddw %%xmm6, %%xmm2 \n\t" | |
63 "psrlw $3, %%xmm0 \n\t" | |
64 "psrlw $3, %%xmm2 \n\t" | |
65 "packuswb %%xmm2, %%xmm0 \n\t" | |
66 AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") | |
67 "movq %%xmm0, (%0) \n\t" | |
68 "movhps %%xmm0, (%0,%3) \n\t" | |
69 "sub $2, %2 \n\t" | |
70 "lea (%1,%3,2), %1 \n\t" | |
71 "lea (%0,%3,2), %0 \n\t" | |
72 "jg 1b \n\t" | |
73 :"+r"(dst), "+r"(src), "+r"(h) | |
74 :"r"((long)stride) | |
75 ); | |
76 } else { | |
77 asm volatile( | |
78 "1: \n\t" | |
79 "movq (%1), %%xmm0 \n\t" | |
80 "movq (%1,%3), %%xmm1 \n\t" | |
81 "movdqa %%xmm1, %%xmm2 \n\t" | |
82 "movq (%1,%3,2), %%xmm3 \n\t" | |
83 "punpcklbw %%xmm1, %%xmm0 \n\t" | |
84 "punpcklbw %%xmm3, %%xmm2 \n\t" | |
85 "pmaddubsw %%xmm7, %%xmm0 \n\t" | |
86 "pmaddubsw %%xmm7, %%xmm2 \n\t" | |
87 AVG_OP("movq (%0), %%xmm4 \n\t") | |
88 AVG_OP("movhps (%0,%3), %%xmm4 \n\t") | |
89 "paddw %%xmm6, %%xmm0 \n\t" | |
90 "paddw %%xmm6, %%xmm2 \n\t" | |
91 "psrlw $3, %%xmm0 \n\t" | |
92 "psrlw $3, %%xmm2 \n\t" | |
93 "packuswb %%xmm2, %%xmm0 \n\t" | |
94 AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") | |
95 "movq %%xmm0, (%0) \n\t" | |
96 "movhps %%xmm0, (%0,%3) \n\t" | |
97 "sub $2, %2 \n\t" | |
98 "lea (%1,%3,2), %1 \n\t" | |
99 "lea (%0,%3,2), %0 \n\t" | |
100 "jg 1b \n\t" | |
101 :"+r"(dst), "+r"(src), "+r"(h) | |
102 :"r"((long)stride) | |
103 ); | |
104 } | |
105 return; | |
106 } | |
107 | |
108 /* general case, bilinear */ | |
109 asm volatile( | |
110 "movd %0, %%xmm7 \n\t" | |
111 "movd %1, %%xmm6 \n\t" | |
112 "movdqa %2, %%xmm5 \n\t" | |
113 "pshuflw $0, %%xmm7, %%xmm7 \n\t" | |
114 "pshuflw $0, %%xmm6, %%xmm6 \n\t" | |
115 "movlhps %%xmm7, %%xmm7 \n\t" | |
116 "movlhps %%xmm6, %%xmm6 \n\t" | |
117 :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(rnd?ff_pw_32:ff_pw_28) | |
118 ); | |
119 | |
120 asm volatile( | |
121 "movq (%1), %%xmm0 \n\t" | |
122 "movq 1(%1), %%xmm1 \n\t" | |
123 "punpcklbw %%xmm1, %%xmm0 \n\t" | |
124 "add %3, %1 \n\t" | |
125 "1: \n\t" | |
126 "movq (%1), %%xmm1 \n\t" | |
127 "movq 1(%1), %%xmm2 \n\t" | |
128 "movq (%1,%3), %%xmm3 \n\t" | |
129 "movq 1(%1,%3), %%xmm4 \n\t" | |
130 "lea (%1,%3,2), %1 \n\t" | |
131 "punpcklbw %%xmm2, %%xmm1 \n\t" | |
132 "punpcklbw %%xmm4, %%xmm3 \n\t" | |
133 "movdqa %%xmm1, %%xmm2 \n\t" | |
134 "movdqa %%xmm3, %%xmm4 \n\t" | |
135 "pmaddubsw %%xmm7, %%xmm0 \n\t" | |
136 "pmaddubsw %%xmm6, %%xmm1 \n\t" | |
137 "pmaddubsw %%xmm7, %%xmm2 \n\t" | |
138 "pmaddubsw %%xmm6, %%xmm3 \n\t" | |
139 "paddw %%xmm5, %%xmm0 \n\t" | |
140 "paddw %%xmm5, %%xmm2 \n\t" | |
141 "paddw %%xmm0, %%xmm1 \n\t" | |
142 "paddw %%xmm2, %%xmm3 \n\t" | |
143 "movdqa %%xmm4, %%xmm0 \n\t" | |
144 "psrlw $6, %%xmm1 \n\t" | |
145 "psrlw $6, %%xmm3 \n\t" | |
146 AVG_OP("movq (%0), %%xmm2 \n\t") | |
147 AVG_OP("movhps (%0,%3), %%xmm2 \n\t") | |
148 "packuswb %%xmm3, %%xmm1 \n\t" | |
149 AVG_OP("pavgb %%xmm2, %%xmm1 \n\t") | |
150 "movq %%xmm1, (%0)\n\t" | |
151 "movhps %%xmm1, (%0,%3)\n\t" | |
152 "sub $2, %2 \n\t" | |
153 "lea (%0,%3,2), %0 \n\t" | |
154 "jg 1b \n\t" | |
155 :"+r"(dst), "+r"(src), "+r"(h) | |
156 :"r"((long)stride) | |
157 ); | |
158 } | |
159 | |
160 static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) | |
161 { | |
/* 4-pixel-wide chroma MC using MMX registers with the SSSE3 pmaddubsw.
 * Same packed-coefficient trick as the 8-wide bilinear path:
 * (x*255+8)*(8-y) broadcasts the top-row byte pair {(8-x)(8-y), x(8-y)}
 * into mm7, (x*255+8)*y the bottom-row pair into mm6.  Note this
 * template takes no rnd argument — the bias is always ff_pw_32
 * (weights sum to 64, shift >>6). */
162 asm volatile( | |
163 "movd %0, %%mm7 \n\t" | |
164 "movd %1, %%mm6 \n\t" | |
165 "movq %2, %%mm5 \n\t" | |
166 "pshufw $0, %%mm7, %%mm7 \n\t" | |
167 "pshufw $0, %%mm6, %%mm6 \n\t" | |
168 :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32) | |
169 ); | |
170 | |
/* Two output rows per iteration; mm0 carries the previous row's
 * {src[i], src[i+1]} interleave across iterations ("movq %%mm4, %%mm0"),
 * so each source row is loaded and interleaved only once.  AVG_OP lines
 * expand only in the avg variant and pavgb against the existing dst. */
171 asm volatile( | |
172 "movd (%1), %%mm0 \n\t" | |
173 "punpcklbw 1(%1), %%mm0 \n\t" | |
174 "add %3, %1 \n\t" | |
175 "1: \n\t" | |
176 "movd (%1), %%mm1 \n\t" | |
177 "movd (%1,%3), %%mm3 \n\t" | |
178 "punpcklbw 1(%1), %%mm1 \n\t" | |
179 "punpcklbw 1(%1,%3), %%mm3 \n\t" | |
180 "lea (%1,%3,2), %1 \n\t" | |
181 "movq %%mm1, %%mm2 \n\t" | |
182 "movq %%mm3, %%mm4 \n\t" | |
183 "pmaddubsw %%mm7, %%mm0 \n\t" | |
184 "pmaddubsw %%mm6, %%mm1 \n\t" | |
185 "pmaddubsw %%mm7, %%mm2 \n\t" | |
186 "pmaddubsw %%mm6, %%mm3 \n\t" | |
187 "paddw %%mm5, %%mm0 \n\t" | |
188 "paddw %%mm5, %%mm2 \n\t" | |
189 "paddw %%mm0, %%mm1 \n\t" | |
190 "paddw %%mm2, %%mm3 \n\t" | |
191 "movq %%mm4, %%mm0 \n\t" | |
192 "psrlw $6, %%mm1 \n\t" | |
193 "psrlw $6, %%mm3 \n\t" | |
194 "packuswb %%mm1, %%mm1 \n\t" | |
195 "packuswb %%mm3, %%mm3 \n\t" | |
196 AVG_OP("pavgb (%0), %%mm1 \n\t") | |
197 AVG_OP("pavgb (%0,%3), %%mm3 \n\t") | |
198 "movd %%mm1, (%0)\n\t" | |
199 "movd %%mm3, (%0,%3)\n\t" | |
200 "sub $2, %2 \n\t" | |
201 "lea (%0,%3,2), %0 \n\t" | |
202 "jg 1b \n\t" | |
203 :"+r"(dst), "+r"(src), "+r"(h) | |
204 :"r"((long)stride) | |
205 ); | |
206 } | |
207 |