Mercurial > libavcodec.hg
annotate x86/vc1dsp_yasm.asm @ 12512:58bd8cb86f5d libavcodec
Remove unused variable.
author | rbultje |
---|---|
date | Fri, 24 Sep 2010 15:31:46 +0000 |
parents | 2982071047a2 |
children |
rev | line source |
---|---|
12144 | 1 ;****************************************************************************** |
2 ;* VC1 deblocking optimizations | |
3 ;* Copyright (c) 2009 David Conrad | |
4 ;* | |
5 ;* This file is part of FFmpeg. | |
6 ;* | |
7 ;* FFmpeg is free software; you can redistribute it and/or | |
8 ;* modify it under the terms of the GNU Lesser General Public | |
9 ;* License as published by the Free Software Foundation; either | |
10 ;* version 2.1 of the License, or (at your option) any later version. | |
11 ;* | |
12 ;* FFmpeg is distributed in the hope that it will be useful, | |
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 ;* Lesser General Public License for more details. | |
16 ;* | |
17 ;* You should have received a copy of the GNU Lesser General Public | |
18 ;* License along with FFmpeg; if not, write to the Free Software | |
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 ;****************************************************************************** | |
21 | |
22 %include "x86inc.asm" | |
23 %include "x86util.asm" | |
24 | |
25 cextern pw_4 | |
26 cextern pw_5 | |
27 | |
28 section .text | |
29 | |
30 ; punpck suffix (e.g. bw), dst_low, dst_high (src), zero | |
31 ; zero-extends one vector from 8 to 16 bits | |
32 %macro UNPACK_8TO16 4 | |
33 mova m%2, m%3 | |
34 punpckh%1 m%3, m%4 | |
35 punpckl%1 m%2, m%4 | |
36 %endmacro | |
37 | |
38 %macro STORE_4_WORDS_MMX 6 | |
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12144
diff
changeset
|
39 movd %6d, %5 |
12144 | 40 %if mmsize==16 |
41 psrldq %5, 4 | |
42 %else | |
43 psrlq %5, 32 | |
44 %endif | |
45 mov %1, %6w | |
46 shr %6, 16 | |
47 mov %2, %6w | |
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12144
diff
changeset
|
48 movd %6d, %5 |
12144 | 49 mov %3, %6w |
50 shr %6, 16 | |
51 mov %4, %6w | |
52 %endmacro | |
53 | |
54 %macro STORE_4_WORDS_SSE4 6 | |
55 pextrw %1, %5, %6+0 | |
56 pextrw %2, %5, %6+1 | |
57 pextrw %3, %5, %6+2 | |
58 pextrw %4, %5, %6+3 | |
59 %endmacro | |
60 | |
61 ; in: p1 p0 q0 q1, clobbers p0 | |
62 ; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3 | |
63 %macro VC1_LOOP_FILTER_A0 4 | |
64 psubw %1, %4 | |
65 psubw %2, %3 | |
66 paddw %1, %1 | |
67 pmullw %2, [pw_5] | |
68 psubw %1, %2 | |
69 paddw %1, [pw_4] | |
70 psraw %1, 3 | |
71 %endmacro | |
72 | |
73 ; in: p0 q0 a0 a1 a2 | |
74 ; m0 m1 m7 m6 m5 | |
75 ; %1: size | |
76 ; out: m0=p0' m1=q0' | |
77 %macro VC1_FILTER 1 | |
78 PABSW m4, m7 | |
79 PABSW m3, m6 | |
80 PABSW m2, m5 | |
81 mova m6, m4 | |
82 pminsw m3, m2 | |
83 pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0) | |
84 psubw m3, m4 | |
85 pmullw m3, [pw_5] ; 5*(a3 - a0) | |
86 PABSW m2, m3 | |
87 psraw m2, 3 ; abs(d/8) | |
88 pxor m7, m3 ; d_sign ^= a0_sign | |
89 | |
90 pxor m5, m5 | |
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12144
diff
changeset
|
91 movd m3, r2d |
12144 | 92 %if %1 > 4 |
93 punpcklbw m3, m3 | |
94 %endif | |
95 punpcklbw m3, m5 | |
96 pcmpgtw m3, m4 ; if (a0 < pq) | |
97 pand m6, m3 | |
98 | |
99 mova m3, m0 | |
100 psubw m3, m1 | |
101 PABSW m4, m3 | |
102 psraw m4, 1 | |
103 pxor m3, m7 ; d_sign ^ clip_sign | |
104 psraw m3, 15 | |
105 pminsw m2, m4 ; min(d, clip) | |
106 pcmpgtw m4, m5 | |
107 pand m6, m4 ; filt3 (C return value) | |
108 | |
109 ; each set of 4 pixels is filtered only if its 3rd pixel is | |
110 %if mmsize==16 | |
111 pshuflw m4, m6, 0xaa | |
112 %if %1 > 4 | |
113 pshufhw m4, m4, 0xaa | |
114 %endif | |
115 %else | |
116 pshufw m4, m6, 0xaa | |
117 %endif | |
118 pandn m3, m4 | |
119 pand m2, m6 | |
120 pand m3, m2 ; d final | |
121 | |
122 PSIGNW m3, m7 | |
123 psubw m0, m3 | |
124 paddw m1, m3 | |
125 packuswb m0, m0 | |
126 packuswb m1, m1 | |
127 %endmacro | |
128 | |
129 ; 1st param: size of filter | |
130 ; 2nd param: mov suffix equivalent to the filter size | |
131 %macro VC1_V_LOOP_FILTER 2 | |
132 pxor m5, m5 | |
133 mov%2 m6, [r4] | |
134 mov%2 m4, [r4+r1] | |
135 mov%2 m7, [r4+2*r1] | |
136 mov%2 m0, [r4+r3] | |
137 punpcklbw m6, m5 | |
138 punpcklbw m4, m5 | |
139 punpcklbw m7, m5 | |
140 punpcklbw m0, m5 | |
141 | |
142 VC1_LOOP_FILTER_A0 m6, m4, m7, m0 | |
143 mov%2 m1, [r0] | |
144 mov%2 m2, [r0+r1] | |
145 punpcklbw m1, m5 | |
146 punpcklbw m2, m5 | |
147 mova m4, m0 | |
148 VC1_LOOP_FILTER_A0 m7, m4, m1, m2 | |
149 mov%2 m3, [r0+2*r1] | |
150 mov%2 m4, [r0+r3] | |
151 punpcklbw m3, m5 | |
152 punpcklbw m4, m5 | |
153 mova m5, m1 | |
154 VC1_LOOP_FILTER_A0 m5, m2, m3, m4 | |
155 | |
156 VC1_FILTER %1 | |
157 mov%2 [r4+r3], m0 | |
158 mov%2 [r0], m1 | |
159 %endmacro | |
160 | |
161 ; 1st param: size of filter | |
162 ; NOTE: for UNPACK_8TO16, each half of a register holds this many 8-bit values | |
163 ; 2nd (optional) param: temp register to use for storing words | |
164 %macro VC1_H_LOOP_FILTER 1-2 | |
165 %if %1 == 4 | |
166 movq m0, [r0 -4] | |
167 movq m1, [r0+ r1-4] | |
168 movq m2, [r0+2*r1-4] | |
169 movq m3, [r0+ r3-4] | |
170 TRANSPOSE4x4B 0, 1, 2, 3, 4 | |
171 %else | |
172 movq m0, [r0 -4] | |
173 movq m4, [r0+ r1-4] | |
174 movq m1, [r0+2*r1-4] | |
175 movq m5, [r0+ r3-4] | |
176 movq m2, [r4 -4] | |
177 movq m6, [r4+ r1-4] | |
178 movq m3, [r4+2*r1-4] | |
179 movq m7, [r4+ r3-4] | |
180 punpcklbw m0, m4 | |
181 punpcklbw m1, m5 | |
182 punpcklbw m2, m6 | |
183 punpcklbw m3, m7 | |
184 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
185 %endif | |
186 pxor m5, m5 | |
187 | |
188 UNPACK_8TO16 bw, 6, 0, 5 | |
189 UNPACK_8TO16 bw, 7, 1, 5 | |
190 VC1_LOOP_FILTER_A0 m6, m0, m7, m1 | |
191 UNPACK_8TO16 bw, 4, 2, 5 | |
192 mova m0, m1 ; m0 = p0 | |
193 VC1_LOOP_FILTER_A0 m7, m1, m4, m2 | |
194 UNPACK_8TO16 bw, 1, 3, 5 | |
195 mova m5, m4 | |
196 VC1_LOOP_FILTER_A0 m5, m2, m1, m3 | |
197 SWAP 1, 4 ; m1 = q0 | |
198 | |
199 VC1_FILTER %1 | |
200 punpcklbw m0, m1 | |
201 %if %0 > 1 | |
202 STORE_4_WORDS_MMX [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2 | |
203 %if %1 > 4 | |
204 psrldq m0, 4 | |
205 STORE_4_WORDS_MMX [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2 | |
206 %endif | |
207 %else | |
208 STORE_4_WORDS_SSE4 [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0 | |
209 STORE_4_WORDS_SSE4 [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4 | |
210 %endif | |
211 %endmacro | |
212 | |
213 | |
214 %macro START_V_FILTER 0 | |
215 mov r4, r0 | |
216 lea r3, [4*r1] | |
217 sub r4, r3 | |
218 lea r3, [r1+2*r1] | |
219 imul r2, 0x01010101 | |
220 %endmacro | |
221 | |
222 %macro START_H_FILTER 1 | |
223 lea r3, [r1+2*r1] | |
224 %if %1 > 4 | |
225 lea r4, [r0+4*r1] | |
226 %endif | |
227 imul r2, 0x01010101 | |
228 %endmacro | |
229 | |
230 ; I don't know why the sign extension is needed... | |
231 %macro PSIGNW_SRA_MMX 2 | |
232 psraw %2, 15 | |
233 PSIGNW_MMX %1, %2 | |
234 %endmacro | |
235 | |
236 | |
237 %macro VC1_LF_MMX 1 | |
238 INIT_MMX | |
239 cglobal vc1_v_loop_filter_internal_%1 | |
240 VC1_V_LOOP_FILTER 4, d | |
241 ret | |
242 | |
243 cglobal vc1_h_loop_filter_internal_%1 | |
244 VC1_H_LOOP_FILTER 4, r4 | |
245 ret | |
246 | |
247 ; void ff_vc1_v_loop_filter4_%1(uint8_t *src, int stride, int pq) | |
248 cglobal vc1_v_loop_filter4_%1, 3,5,0 | |
249 START_V_FILTER | |
250 call vc1_v_loop_filter_internal_%1 | |
251 RET | |
252 | |
253 ; void ff_vc1_h_loop_filter4_%1(uint8_t *src, int stride, int pq) | |
254 cglobal vc1_h_loop_filter4_%1, 3,5,0 | |
255 START_H_FILTER 4 | |
256 call vc1_h_loop_filter_internal_%1 | |
257 RET | |
258 | |
259 ; void ff_vc1_v_loop_filter8_%1(uint8_t *src, int stride, int pq) | |
260 cglobal vc1_v_loop_filter8_%1, 3,5,0 | |
261 START_V_FILTER | |
262 call vc1_v_loop_filter_internal_%1 | |
263 add r4, 4 | |
264 add r0, 4 | |
265 call vc1_v_loop_filter_internal_%1 | |
266 RET | |
267 | |
268 ; void ff_vc1_h_loop_filter8_%1(uint8_t *src, int stride, int pq) | |
269 cglobal vc1_h_loop_filter8_%1, 3,5,0 | |
270 START_H_FILTER 4 | |
271 call vc1_h_loop_filter_internal_%1 | |
272 lea r0, [r0+4*r1] | |
273 call vc1_h_loop_filter_internal_%1 | |
274 RET | |
275 %endmacro | |
276 | |
277 %define PABSW PABSW_MMX | |
278 %define PSIGNW PSIGNW_SRA_MMX | |
279 VC1_LF_MMX mmx | |
280 | |
281 %define PABSW PABSW_MMX2 | |
282 VC1_LF_MMX mmx2 | |
283 | |
284 INIT_XMM | |
285 ; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq) | |
286 cglobal vc1_v_loop_filter8_sse2, 3,5,8 | |
287 START_V_FILTER | |
288 VC1_V_LOOP_FILTER 8, q | |
289 RET | |
290 | |
291 ; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq) | |
292 cglobal vc1_h_loop_filter8_sse2, 3,6,8 | |
293 START_H_FILTER 8 | |
294 VC1_H_LOOP_FILTER 8, r5 | |
295 RET | |
296 | |
297 %define PABSW PABSW_SSSE3 | |
298 %define PSIGNW PSIGNW_SSSE3 | |
299 | |
300 INIT_MMX | |
301 ; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq) | |
302 cglobal vc1_v_loop_filter4_ssse3, 3,5,0 | |
303 START_V_FILTER | |
304 VC1_V_LOOP_FILTER 4, d | |
305 RET | |
306 | |
307 ; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq) | |
308 cglobal vc1_h_loop_filter4_ssse3, 3,5,0 | |
309 START_H_FILTER 4 | |
310 VC1_H_LOOP_FILTER 4, r4 | |
311 RET | |
312 | |
313 INIT_XMM | |
314 ; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq) | |
315 cglobal vc1_v_loop_filter8_ssse3, 3,5,8 | |
316 START_V_FILTER | |
317 VC1_V_LOOP_FILTER 8, q | |
318 RET | |
319 | |
320 ; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq) | |
321 cglobal vc1_h_loop_filter8_ssse3, 3,6,8 | |
322 START_H_FILTER 8 | |
323 VC1_H_LOOP_FILTER 8, r5 | |
324 RET | |
325 | |
326 ; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq) | |
327 cglobal vc1_h_loop_filter8_sse4, 3,5,8 | |
328 START_H_FILTER 8 | |
329 VC1_H_LOOP_FILTER 8 | |
330 RET |