annotate x86/vc1dsp_yasm.asm @ 12506:747e5f278c4b libavcodec

The debug text output of macroblocks can indicate MB_TYPE_INTERLACED, but it used to do it only for h264 codec. Allow it for other codecs, as mpeg2 and mpeg4 also set this flag.
author iive
date Tue, 21 Sep 2010 22:44:27 +0000
parents 2982071047a2
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
12144
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
1 ;******************************************************************************
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
2 ;* VC1 deblocking optimizations
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
3 ;* Copyright (c) 2009 David Conrad
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
4 ;*
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
5 ;* This file is part of FFmpeg.
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
6 ;*
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
7 ;* FFmpeg is free software; you can redistribute it and/or
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
8 ;* modify it under the terms of the GNU Lesser General Public
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
9 ;* License as published by the Free Software Foundation; either
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
10 ;* version 2.1 of the License, or (at your option) any later version.
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
11 ;*
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
12 ;* FFmpeg is distributed in the hope that it will be useful,
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
15 ;* Lesser General Public License for more details.
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
16 ;*
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
17 ;* You should have received a copy of the GNU Lesser General Public
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
18 ;* License along with FFmpeg; if not, write to the Free Software
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
20 ;******************************************************************************
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
21
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
22 %include "x86inc.asm"
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
23 %include "x86util.asm"
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
24
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
25 cextern pw_4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
26 cextern pw_5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
27
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
28 section .text
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
29
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
30 ; punpck suffix (e.g. bw), dst_low, dst_high (src), zero
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
31 ; zero-extends one vector from 8 to 16 bits
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
32 %macro UNPACK_8TO16 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
33 mova m%2, m%3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
34 punpckh%1 m%3, m%4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
35 punpckl%1 m%2, m%4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
36 %endmacro
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
37
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
38 %macro STORE_4_WORDS_MMX 6
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12144
diff changeset
39 movd %6d, %5
12144
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
40 %if mmsize==16
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
41 psrldq %5, 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
42 %else
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
43 psrlq %5, 32
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
44 %endif
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
45 mov %1, %6w
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
46 shr %6, 16
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
47 mov %2, %6w
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12144
diff changeset
48 movd %6d, %5
12144
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
49 mov %3, %6w
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
50 shr %6, 16
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
51 mov %4, %6w
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
52 %endmacro
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
53
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
54 %macro STORE_4_WORDS_SSE4 6
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
55 pextrw %1, %5, %6+0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
56 pextrw %2, %5, %6+1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
57 pextrw %3, %5, %6+2
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
58 pextrw %4, %5, %6+3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
59 %endmacro
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
60
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
61 ; in: p1 p0 q0 q1, clobbers p0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
62 ; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
63 %macro VC1_LOOP_FILTER_A0 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
64 psubw %1, %4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
65 psubw %2, %3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
66 paddw %1, %1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
67 pmullw %2, [pw_5]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
68 psubw %1, %2
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
69 paddw %1, [pw_4]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
70 psraw %1, 3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
71 %endmacro
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
72
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
73 ; in: p0 q0 a0 a1 a2
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
74 ; m0 m1 m7 m6 m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
75 ; %1: size
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
76 ; out: m0=p0' m1=q0'
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
77 %macro VC1_FILTER 1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
78 PABSW m4, m7
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
79 PABSW m3, m6
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
80 PABSW m2, m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
81 mova m6, m4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
82 pminsw m3, m2
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
83 pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
84 psubw m3, m4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
85 pmullw m3, [pw_5] ; 5*(a3 - a0)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
86 PABSW m2, m3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
87 psraw m2, 3 ; abs(d/8)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
88 pxor m7, m3 ; d_sign ^= a0_sign
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
89
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
90 pxor m5, m5
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12144
diff changeset
91 movd m3, r2d
12144
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
92 %if %1 > 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
93 punpcklbw m3, m3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
94 %endif
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
95 punpcklbw m3, m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
96 pcmpgtw m3, m4 ; if (a0 < pq)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
97 pand m6, m3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
98
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
99 mova m3, m0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
100 psubw m3, m1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
101 PABSW m4, m3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
102 psraw m4, 1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
103 pxor m3, m7 ; d_sign ^ clip_sign
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
104 psraw m3, 15
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
105 pminsw m2, m4 ; min(d, clip)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
106 pcmpgtw m4, m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
107 pand m6, m4 ; filt3 (C return value)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
108
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
109 ; each set of 4 pixels is not filtered if the 3rd is not
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
110 %if mmsize==16
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
111 pshuflw m4, m6, 0xaa
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
112 %if %1 > 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
113 pshufhw m4, m4, 0xaa
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
114 %endif
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
115 %else
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
116 pshufw m4, m6, 0xaa
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
117 %endif
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
118 pandn m3, m4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
119 pand m2, m6
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
120 pand m3, m2 ; d final
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
121
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
122 PSIGNW m3, m7
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
123 psubw m0, m3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
124 paddw m1, m3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
125 packuswb m0, m0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
126 packuswb m1, m1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
127 %endmacro
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
128
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
129 ; 1st param: size of filter
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
130 ; 2nd param: mov suffix equivalent to the filter size
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
131 %macro VC1_V_LOOP_FILTER 2
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
132 pxor m5, m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
133 mov%2 m6, [r4]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
134 mov%2 m4, [r4+r1]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
135 mov%2 m7, [r4+2*r1]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
136 mov%2 m0, [r4+r3]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
137 punpcklbw m6, m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
138 punpcklbw m4, m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
139 punpcklbw m7, m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
140 punpcklbw m0, m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
141
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
142 VC1_LOOP_FILTER_A0 m6, m4, m7, m0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
143 mov%2 m1, [r0]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
144 mov%2 m2, [r0+r1]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
145 punpcklbw m1, m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
146 punpcklbw m2, m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
147 mova m4, m0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
148 VC1_LOOP_FILTER_A0 m7, m4, m1, m2
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
149 mov%2 m3, [r0+2*r1]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
150 mov%2 m4, [r0+r3]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
151 punpcklbw m3, m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
152 punpcklbw m4, m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
153 mova m5, m1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
154 VC1_LOOP_FILTER_A0 m5, m2, m3, m4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
155
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
156 VC1_FILTER %1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
157 mov%2 [r4+r3], m0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
158 mov%2 [r0], m1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
159 %endmacro
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
160
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
161 ; 1st param: size of filter
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
162 ; NOTE: this is the number of 8-bit values held in half a register (they are widened with UNPACK_8TO16)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
163 ; 2nd (optional) param: temp register to use for storing words
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
164 %macro VC1_H_LOOP_FILTER 1-2
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
165 %if %1 == 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
166 movq m0, [r0 -4]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
167 movq m1, [r0+ r1-4]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
168 movq m2, [r0+2*r1-4]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
169 movq m3, [r0+ r3-4]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
170 TRANSPOSE4x4B 0, 1, 2, 3, 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
171 %else
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
172 movq m0, [r0 -4]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
173 movq m4, [r0+ r1-4]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
174 movq m1, [r0+2*r1-4]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
175 movq m5, [r0+ r3-4]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
176 movq m2, [r4 -4]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
177 movq m6, [r4+ r1-4]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
178 movq m3, [r4+2*r1-4]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
179 movq m7, [r4+ r3-4]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
180 punpcklbw m0, m4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
181 punpcklbw m1, m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
182 punpcklbw m2, m6
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
183 punpcklbw m3, m7
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
184 TRANSPOSE4x4W 0, 1, 2, 3, 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
185 %endif
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
186 pxor m5, m5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
187
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
188 UNPACK_8TO16 bw, 6, 0, 5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
189 UNPACK_8TO16 bw, 7, 1, 5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
190 VC1_LOOP_FILTER_A0 m6, m0, m7, m1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
191 UNPACK_8TO16 bw, 4, 2, 5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
192 mova m0, m1 ; m0 = p0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
193 VC1_LOOP_FILTER_A0 m7, m1, m4, m2
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
194 UNPACK_8TO16 bw, 1, 3, 5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
195 mova m5, m4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
196 VC1_LOOP_FILTER_A0 m5, m2, m1, m3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
197 SWAP 1, 4 ; m1 = q0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
198
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
199 VC1_FILTER %1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
200 punpcklbw m0, m1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
201 %if %0 > 1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
202 STORE_4_WORDS_MMX [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
203 %if %1 > 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
204 psrldq m0, 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
205 STORE_4_WORDS_MMX [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
206 %endif
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
207 %else
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
208 STORE_4_WORDS_SSE4 [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
209 STORE_4_WORDS_SSE4 [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
210 %endif
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
211 %endmacro
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
212
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
213
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
214 %macro START_V_FILTER 0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
215 mov r4, r0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
216 lea r3, [4*r1]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
217 sub r4, r3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
218 lea r3, [r1+2*r1]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
219 imul r2, 0x01010101
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
220 %endmacro
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
221
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
222 %macro START_H_FILTER 1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
223 lea r3, [r1+2*r1]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
224 %if %1 > 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
225 lea r4, [r0+4*r1]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
226 %endif
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
227 imul r2, 0x01010101
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
228 %endmacro
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
229
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
230 ; I don't know why the sign extension is needed...
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
231 %macro PSIGNW_SRA_MMX 2
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
232 psraw %2, 15
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
233 PSIGNW_MMX %1, %2
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
234 %endmacro
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
235
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
236
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
237 %macro VC1_LF_MMX 1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
238 INIT_MMX
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
239 cglobal vc1_v_loop_filter_internal_%1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
240 VC1_V_LOOP_FILTER 4, d
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
241 ret
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
242
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
243 cglobal vc1_h_loop_filter_internal_%1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
244 VC1_H_LOOP_FILTER 4, r4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
245 ret
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
246
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
247 ; void ff_vc1_v_loop_filter4_mmx2(uint8_t *src, int stride, int pq)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
248 cglobal vc1_v_loop_filter4_%1, 3,5,0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
249 START_V_FILTER
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
250 call vc1_v_loop_filter_internal_%1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
251 RET
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
252
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
253 ; void ff_vc1_h_loop_filter4_mmx2(uint8_t *src, int stride, int pq)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
254 cglobal vc1_h_loop_filter4_%1, 3,5,0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
255 START_H_FILTER 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
256 call vc1_h_loop_filter_internal_%1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
257 RET
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
258
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
259 ; void ff_vc1_v_loop_filter8_mmx2(uint8_t *src, int stride, int pq)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
260 cglobal vc1_v_loop_filter8_%1, 3,5,0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
261 START_V_FILTER
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
262 call vc1_v_loop_filter_internal_%1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
263 add r4, 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
264 add r0, 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
265 call vc1_v_loop_filter_internal_%1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
266 RET
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
267
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
268 ; void ff_vc1_h_loop_filter8_mmx2(uint8_t *src, int stride, int pq)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
269 cglobal vc1_h_loop_filter8_%1, 3,5,0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
270 START_H_FILTER 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
271 call vc1_h_loop_filter_internal_%1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
272 lea r0, [r0+4*r1]
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
273 call vc1_h_loop_filter_internal_%1
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
274 RET
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
275 %endmacro
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
276
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
277 %define PABSW PABSW_MMX
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
278 %define PSIGNW PSIGNW_SRA_MMX
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
279 VC1_LF_MMX mmx
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
280
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
281 %define PABSW PABSW_MMX2
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
282 VC1_LF_MMX mmx2
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
283
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
284 INIT_XMM
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
285 ; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
286 cglobal vc1_v_loop_filter8_sse2, 3,5,8
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
287 START_V_FILTER
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
288 VC1_V_LOOP_FILTER 8, q
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
289 RET
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
290
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
291 ; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
292 cglobal vc1_h_loop_filter8_sse2, 3,6,8
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
293 START_H_FILTER 8
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
294 VC1_H_LOOP_FILTER 8, r5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
295 RET
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
296
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
297 %define PABSW PABSW_SSSE3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
298 %define PSIGNW PSIGNW_SSSE3
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
299
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
300 INIT_MMX
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
301 ; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
302 cglobal vc1_v_loop_filter4_ssse3, 3,5,0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
303 START_V_FILTER
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
304 VC1_V_LOOP_FILTER 4, d
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
305 RET
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
306
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
307 ; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
308 cglobal vc1_h_loop_filter4_ssse3, 3,5,0
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
309 START_H_FILTER 4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
310 VC1_H_LOOP_FILTER 4, r4
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
311 RET
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
312
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
313 INIT_XMM
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
314 ; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
315 cglobal vc1_v_loop_filter8_ssse3, 3,5,8
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
316 START_V_FILTER
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
317 VC1_V_LOOP_FILTER 8, q
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
318 RET
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
319
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
320 ; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
321 cglobal vc1_h_loop_filter8_ssse3, 3,6,8
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
322 START_H_FILTER 8
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
323 VC1_H_LOOP_FILTER 8, r5
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
324 RET
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
325
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
326 ; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
327 cglobal vc1_h_loop_filter8_sse4, 3,5,8
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
328 START_H_FILTER 8
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
329 VC1_H_LOOP_FILTER 8
846779f6b164 MMX/SSE VC1 loop filter
conrad
parents:
diff changeset
330 RET